/*******************************************************************************

    Functions to convert non-ASCII and characters reserved in URLs to percent
    encoded form.

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

*******************************************************************************/

module ocean.net.util.UrlEncoder;


import ocean.meta.types.Qualifiers;
import ocean.core.Verify;
import core.stdc.ctype: isgraph;

version (unittest) import ocean.core.Test;

/******************************************************************************

    Converts non-unreserved characters. Unreserved characters are the ASCII
    alphanumeric characters and

        -._~

    .

    @see http://tools.ietf.org/html/rfc3986#section-2.3

    Special cases:

    - The whitespace character 0x20 is encoded as "%20" (not "+").
    - Characters below 0x20 and above 0x7E are encoded straight away, regardless
      of any encoding or codepage. For example, the UTF-8 encoded string
      "Münzstraße", which corresponds to the byte sequence
      [0x4D, 0xC3, 0xBC, 0x6E, 0x7A, 0x73, 0x74, 0x72, 0x61, 0xC3, 0x9F, 0x65]
       ...M  .........ü  ...n  ...z  ...s  ...t  ...r  ...a  .........ß  ...e
      , is encoded as "M%C3%BCnzstra%C3%9Fe".

    UrlEncoder class

    Memory friendly, suitable for stack-allocated 'scope' instances.

 ******************************************************************************/

class EncodeNonUnreserved : PercentEncoder
{
    /**************************************************************************

        Character map, indexed by character value; true for unreserved
        characters, false (the default) for everything else.

     **************************************************************************/

    static immutable bool[char.max + 1] unreserved =
    [
        'A': true, 'B': true, 'C': true, 'D': true, 'E': true, 'F': true,
        'G': true, 'H': true, 'I': true, 'J': true, 'K': true, 'L': true,
        'M': true, 'N': true, 'O': true, 'P': true, 'Q': true, 'R': true,
        'S': true, 'T': true, 'U': true, 'V': true, 'W': true, 'X': true,
        'Y': true, 'Z': true,
        'a': true, 'b': true, 'c': true, 'd': true, 'e': true, 'f': true,
        'g': true, 'h': true, 'i': true, 'j': true, 'k': true, 'l': true,
        'm': true, 'n': true, 'o': true, 'p': true, 'q': true, 'r': true,
        's': true, 't': true, 'u': true, 'v': true, 'w': true, 'x': true,
        'y': true, 'z': true,
        '0': true, '1': true, '2': true, '3': true, '4': true, '5': true,
        '6': true, '7': true, '8': true, '9': true,
        '-': true, '_': true, '.': true, '~': true
    ];

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Tells whether c should be converted: every character that is not
        flagged in the unreserved map is converted.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected override bool encode ( char c )
    {
        return !this.unreserved[c];
    }
}

/******************************************************************************

    Encodes all characters except the ASCII graphic, that is, encodes ASCII
    whitespace and control characters and non-ASCII characters.

 ******************************************************************************/

class EncodeExceptAsciiGraph : PercentEncoder
{
    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Tells whether c should be converted.

        The ASCII graphic characters are exactly the range '!' (0x21) to
        '~' (0x7E). The range is checked explicitly rather than through the C
        isgraph() function because the result of isgraph() depends on the
        current C locale: with a non-"C" locale it may report bytes >= 0x80 as
        graphic, which would let non-ASCII bytes pass through unencoded.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected override bool encode ( char c )
    {
        return c < '!' || c > '~';
    }
}

/******************************************************************************

    Abstract encoder

 ******************************************************************************/

class PercentEncoder
{
    /**************************************************************************

        Source string, may be changed at any time except during encoding
        'foreach' iteration.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /**************************************************************************

        Encodes this.source in a 'foreach' iteration over encoded chunks.
        Each chunk is guaranteed not to be empty.

        A chunk is either a slice of this.source containing a run of
        characters that need no conversion, or a three character "%XX"
        sequence (upper case hexadecimal) for one converted character. The
        "%XX" chunk references a buffer local to this method which is
        overwritten for the next converted character, so it is only valid
        during the current iteration step.

        Params:
            dg = 'foreach' body delegate; a non-zero return value stops the
                 iteration

        Returns:
            the return value of the last call of dg or 0 if dg was not called.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        static immutable hex_digits = "0123456789ABCDEF";

        int result = 0;

        // Wraps dg so that slices and the static hex buffer, which are not
        // lvalues of type cstring, can be passed to its 'ref' parameter; also
        // records the delegate result.
        int callDg ( cstring chunk )
        {
            return result = dg(chunk);
        }

        // Index of the first source character not yet passed to dg.
        size_t start = 0;

        char[3] hex;

        hex[0] = '%';

        foreach (i, c; this.source)
        {
            if (this.encode(c))
            {
                verify(start <= i);

                // Flush the pending run of unconverted characters, if any;
                // skipping the empty run keeps chunks non-empty.
                if (start < i)
                {
                    if (callDg(this.source[start .. i])) return result;
                }

                hex[1] = hex_digits [(c >> 4) & 0xF];
                hex[2] = hex_digits [c & 0xF];

                if (callDg(hex)) return result;

                start = i + 1;
            }
        }

        verify(start <= this.source.length);

        // Flush the trailing run of unconverted characters, if any.
        return (start < this.source.length)?
            callDg(this.source[start .. $]) : result;
    }

    /**************************************************************************

        Tells whether c should be converted to its "%XX" percent encoded form.
        To be implemented by the subclass to define the set of converted
        characters.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected abstract bool encode ( char c );
}


unittest
{
    static void checkRange ( char first, char last )
    {
        for (char c = first; c <= last; c++)
        {
            test (EncodeNonUnreserved.unreserved[c],
                  "'" ~ c ~ "' is supposed to be unreserved");
        }
    }

    checkRange('A', 'Z');
    checkRange('a', 'z');
    checkRange('0', '9');

    foreach (c; "-_.~")
    {
        test (EncodeNonUnreserved.unreserved[c],
              "'" ~ c ~ "' is supposed to be unreserved");
    }

    scope encoder = new EncodeNonUnreserved("For example, the octet "
        ~ "corresponding to the tilde (\"~\") character is often encoded as "
        ~ "\"%7E\" by older URI processing implementations; the \"%7E\" can be "
        ~ "replaced by \"~\" without chänging its interpretation.");

    static immutable istring[] chunks =
    [
        "For", "%20", "example", "%2C", "%20", "the", "%20", "octet", "%20",
        "corresponding","%20", "to", "%20", "the", "%20", "tilde", "%20",
        "%28", "%22", "~", "%22", "%29", "%20", "character", "%20", "is",
        "%20", "often", "%20", "encoded", "%20", "as", "%20", "%22", "%25",
        "7E", "%22", "%20", "by", "%20", "older", "%20", "URI", "%20",
        "processing", "%20", "implementations", "%3B", "%20", "the", "%20",
        "%22", "%25", "7E", "%22", "%20", "can", "%20", "be", "%20",
        "replaced", "%20", "by", "%20", "%22", "~", "%22", "%20", "without",
        "%20", "ch", "%C3", "%A4", "nging", "%20", "its", "%20",
        "interpretation."
    ];

    size_t i = 0;

    foreach (chunk; encoder)
    {
        test (i < chunks.length);
        test (chunks[i++] == chunk);
    }

    // Make sure the encoder did not stop early.
    test (i == chunks.length);
}

unittest
{
    // Space, tab and the two UTF-8 bytes of 'ä' are converted; the ASCII
    // graphic characters, including the boundary character '~', are not.
    scope encoder = new EncodeExceptAsciiGraph("a b\t~ä");

    static immutable istring[] chunks =
    [
        "a", "%20", "b", "%09", "~", "%C3", "%A4"
    ];

    size_t i = 0;

    foreach (chunk; encoder)
    {
        test (i < chunks.length);
        test (chunks[i++] == chunk);
    }

    test (i == chunks.length);
}