/******************************************************************************

    UTF-8 URL decoder

    Uses the glib 2.0, use

        -Lglib-2.0

    as linking parameter.

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

 ******************************************************************************/

module ocean.net.util.UrlDecoder;


import ocean.transition;

import ocean.core.Verify;

import ocean.text.util.SplitIterator: ChrSplitIterator;

import ocean.stdc.string: memmove;

version(UnitTest) import ocean.core.Test;

extern (C) private
{
    /**************************************************************************

        Determines the numeric value of a character as a hexadecimal digit.

        @see http://developer.gnome.org/glib/stable/glib-String-Utility-Functions.html#g-ascii-xdigit-value

        Params:
            c = an ASCII character.

        Returns:
            If c is a hex digit its numeric value. Otherwise, -1.

     **************************************************************************/

    int g_ascii_xdigit_value (char c);

    /**************************************************************************

        Converts a single character to UTF-8.

        @see http://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-unichar-to-utf8

        Params:
            c      = a Unicode character code
            outbuf = output buffer, must have at least 6 bytes of space.
                     If NULL, the length will be computed and returned and
                     nothing will be written to outbuf.

        Returns:
            number of bytes written

     **************************************************************************/

    int g_unichar_to_utf8(dchar c, char* outbuf);
}

/******************************************************************************

    UrlDecoder class

    Memory friendly, suitable for stack-allocated 'scope' instances.

 ******************************************************************************/

class UrlDecoder
{
    /**************************************************************************

        Source string, may be changed at any time except during decoding
        'foreach' iteration.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /***************************************************************************

        Decodes this.source in a 'foreach' iteration over decoded chunks.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        For every successfully decoded sequence copyDecoded() is consulted: if
        it returns true the decoded character is passed to the delegate,
        otherwise the original encoded sequence (including the leading '%') is
        passed. A '%' that does not start a valid sequence is passed through
        unchanged.

        Note that the chunk holding a decoded character slices a buffer that is
        local to this method, so it must not be referenced after the iteration
        step that delivered it.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        // Adapter which allows passing rvalues to the 'ref' delegate.
        int callDg ( cstring str )
        {
            return dg(str);
        }

        scope iterate_markers = new ChrSplitIterator('%');

        // The tail after the last '%' is delivered explicitly at the end.
        iterate_markers.include_remaining = false;

        size_t first_marker = iterate_markers.reset(this.source).locateDelim();

        if (first_marker < this.source.length)
        {
            // Everything before the first '%' needs no decoding.
            int result = callDg(this.source[0 .. first_marker]);

            if (!result) foreach (ref pos, between; iterate_markers.reset(this.source[first_marker .. $]))
            {
                // Plain text between two '%' markers.
                result = dg(between);

                if (result) break;

                // The part of this.source right after the current '%'.
                auto remaining = iterate_markers.remaining;

                char[6] decoded_buf;
                size_t read_pos = 0;

                auto decoded = decodeCharacter(decoded_buf, remaining, read_pos);

                if (decoded.length)
                {
                    verify(read_pos != 0);

                    // remaining is the tail of this.source that starts right
                    // after the '%' which introduces the encoded sequence, so
                    // the sequence starts one character before that tail.
                    verify(remaining.length < this.source.length);

                    size_t seq_start = this.source.length - remaining.length;

                    auto original = this.source[seq_start - 1 .. seq_start + read_pos];

                    result = callDg(this.copyDecoded(decoded, original)?
                                     decoded : original);

                    // Make the iterator skip the consumed characters.
                    pos += read_pos;
                }
                else // decoding error
                {
                    verify(!read_pos);

                    // Pass the '%' through; the characters following it are
                    // delivered as ordinary text in the next iteration.
                    result = callDg("%");
                }

                if (result) break;
            }

            return result? result : callDg(iterate_markers.remaining);
        }
        else
        {
            // No '%' in the source: nothing to decode.
            return dg(this.source);
        }
    }

    /***************************************************************************

        Extracts a single character from the specified position in the passed
        string, which is expected to be the index of a character preceded by a
        '%'.
        source[pos .. $] is scanned to see if they represent an encoded
        character in either the RFC 2396 escape format (%XX) or the non-standard
        escape format (%uXXXX) or if they should represent a '%' (%%).

        (See: http://en.wikipedia.org/wiki/Percent-encoding)

        On success the extracted character is written as utf8 into the provided
        output buffer and pos is increased to the index right after the last
        consumed character in source. On failure pos remains unchanged.

        Params:
            dst    = string buffer to receive decoded characters; must not be
                     empty and must have a length of at least 6 if source may
                     contain the "uXXXX" form (required by hex4()). dst[0] may
                     be overwritten even on failure.
            source = character string to decode a character from; may be
                     empty or null which will result in failure
            pos    = position in source

        Returns:
            a slice to the UTF-8 representation of the decoded character in dst
            on success or an empty string on failure. The returned string is
            guaranteed to slice dst from dst[0].

     ***************************************************************************/

    public static mstring decodeCharacter ( mstring dst, cstring source, ref size_t pos )
    out (slice)
    {
        assert (slice.ptr is dst.ptr, typeof (this).stringof ~ ".decodeCharacter: bad returned slice");
        assert(pos <= source.length, typeof (this).stringof ~ ".decodeCharacter (out): offset out of array bounds");
    }
    body
    {
        verify(
            pos <= source.length,
            typeof (this).stringof ~
                ".decodeCharacter (in): offset out of array bounds"
        );

        auto src = source[pos .. $];

        size_t read    = 0,
               written = 0;

        if (src.length) switch (src[0])
        {
            default:
                // "XX": two hexadecimal digits.
                if (src.length >= 2)
                {
                    written = hex2(src[0], src[1], dst[0]);

                    if (written)
                    {
                        read = 2;
                    }
                }
                break;

            case 'u':
                // "uXXXX": four hexadecimal digits denoting a code point.
                if (src.length >= 5)
                {
                    written = hex4(src[1 .. 5], dst).length;

                    if (written)
                    {
                        read = 5;
                    }
                }
                break;

            case '%':
                // "%%" denotes a literal '%'.
                read    = 1;
                written = 1;
                dst[0]  = src[0];
        }

        pos += read;

        return dst[0 .. written];
    }

    /***************************************************************************

        Decodes '%' encoded characters in str, replacing them in-place.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        Note that the original content in str is overwritten with the decoded
        content. The resulting content is at most as long as the original. The
        returned string slices the valid content in str. str itself may contain
        trailing junk.

        Params:
            str = string to decode

        Returns:
            the decoded str content (slices str from the beginning)

        Out:
            The returned array slices str from the beginning.

     ***************************************************************************/

    public static mstring decode ( mstring str )
    out (str_out)
    {
        assert (str_out.ptr is str.ptr);
    }
    body
    {
        // Write position in str; never runs ahead of the read position.
        size_t pos = 0;

        if (str.length)
        {
            scope iterator = new ChrSplitIterator('%');

            // Skip the beginning of str before the first '%'.

            foreach (chunk; iterator.reset(str))
            {
                pos = chunk.length;
                break;
            }

            // true if the previous chunk was empty and a '%' was emitted for
            // it, that is, the current chunk follows a "%%" sequence.
            bool had_percent = false;

            foreach (chunk; iterator)
            {
                size_t read, written = 0;

                if (chunk.length)
                {
                    if (chunk[0] == 'u')
                    {
                        // Have a 'u': Assume four hex digits follow which denote
                        // the character value; decode that character and copy the
                        // UTF-8 sequence into str, starting from pos. Note that
                        // since g_unichar_to_utf8() produces UTF-8 sequence of 6
                        // bytes maximum, the UTF-8 sequence won't be longer than
                        // the original "%u####" sequence.

                        read = 5;
                        if (chunk.length >= read)
                        {
                            written = hex4(chunk[1 .. read], str[pos .. pos + 6]).length;
                        }
                    }
                    else
                    {
                        // Assume two hex digits follow which denote the character
                        // value; replace str[pos] with the corresponding character.

                        read = 2;
                        if (chunk.length >= read)
                        {
                            written = hex2(chunk[0], chunk[1], str[pos]);
                        }
                    }
                }
                else
                {
                    // Empty chunk: two adjacent '%'. Emit a single '%' for
                    // the first one of a pair and nothing for the second.

                    if (had_percent)
                    {
                        had_percent = false;
                    }
                    else
                    {
                        str[pos++] = '%';
                        had_percent = true;
                    }

                    continue;
                }

                verify(written <= read);

                // written = 0 => error: Pass through the erroneous sequence,
                // prepending the '%' that was skipped by the iterator unless
                // it has already been emitted for a preceding "%%".

                if (!written)
                {
                    if (!had_percent)
                    {
                        str[pos] = '%';
                        written = 1;
                    }

                    read = 0;
                }

                pos += written;

                // Move the rest of chunk to the front.

                if (chunk.length > read)
                {
                    cstring between = chunk[read .. $];

                    // Source and destination may overlap, hence memmove.
                    memmove(&str[pos], &between[0], between.length);

                    pos += between.length;
                }

                had_percent = false;
            }
        }

        return str[0 .. pos];
    }

    /***************************************************************************

        Creates a character c with the value specified by the 2-digit ASCII
        hexadecimal number whose digits are hi and lo. For example, if
        hi = 'E' or 'e' and lo = '9', c will be 0xE9.

        Note that c is an 'out' parameter, so it is reset to char.init on entry
        and therefore modified even if false is returned.

        Params:
            hi = most significant hexadecimal digit (ASCII)
            lo = least significant hexadecimal digit (ASCII)
            c  = output character

        Returns:
            true on success or false if hi or lo or both are not a hexadecimal
            digit.

     ***************************************************************************/

    static bool hex2 ( char hi, char lo, out char c )
    {
        int xhi = g_ascii_xdigit_value(hi),
            xlo = g_ascii_xdigit_value(lo);

        if (xhi >= 0 && xlo >= 0)
        {
            c = cast(char) ((xhi << 4) | xlo);

            return true;
        }
        else
        {
            return false;
        }
    }

    /***************************************************************************

        Converts hex, which is expected to contain a 4-digit ASCII hexadecimal
        number, into its corresponding UTF-8 character sequence.

        Params:
            hex      = character code in hexadecimal representation (ASCII)
            utf8_buf = destination buffer for the UTF-8 sequence of the
                       character; the length must be at least 6; may contain
                       trailing junk if the sequence is actually shorter

        Returns:
            the UTF-8 sequence (slices the valid data in utf8_buf) on success or
            an empty string on failure.

        In:
            - hex.length must be 4,
            - utf8_buf.length must at least be 6.

        Out:
            The returned string slices utf8_buf from the beginning.

     ***************************************************************************/

    static mstring hex4 ( cstring hex, mstring utf8_buf )
    out (utf8)
    {
        assert (utf8_buf.ptr is utf8.ptr);
    }
    body
    {
        verify (hex.length == 4);
        verify (utf8_buf.length >= 6);

        int hihi = g_ascii_xdigit_value(hex[0]),
            hilo = g_ascii_xdigit_value(hex[1]),
            lohi = g_ascii_xdigit_value(hex[2]),
            lolo = g_ascii_xdigit_value(hex[3]);

        size_t n = 0;

        // Only convert if all four characters are hexadecimal digits.
        if (hihi >= 0 && hilo >= 0 && lohi >= 0 && lolo >= 0)
        {
            dchar c = ((cast (dchar) hihi) << 0xC) |
                      ((cast (dchar) hilo) << 0x8) |
                      ((cast (dchar) lohi) << 0x4) |
                      ((cast (dchar) lolo));

            n = cast (size_t) g_unichar_to_utf8(c, utf8_buf.ptr);
        }

        return utf8_buf[0 .. n];
    }

    /**************************************************************************

        To be overridden as an option, called by opApply().

        Determines whether each decoded character should be passed as 'foreach'
        iteration variable string in its decoded or its original (encoded) form.
        This can be used in cases where the decoding of only certain characters
        is desired.

        By default always the decoded form is selected.

        Params:
            decoded  = decoded form of the character
            original = original (encoded) form, including the leading '%'

        Returns:
            true to use the decoded or false to use the original (encoded) form.

     **************************************************************************/

    protected bool copyDecoded ( cstring decoded, cstring original )
    {
        return true;
    }
}


unittest
{
    scope decoder = new UrlDecoder("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207%"),
          decoded = new char[0];

    foreach (chunk; decoder)
    {
        decoded ~= chunk;
    }

    test (decoded == "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇%");

    test (UrlDecoder.decode("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207".dup) ==
          "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇");
}

// If copyDecoded() rejects every decoded character, the iteration must
// deliver the encoded sequences unchanged, reproducing the source string.
unittest
{
    static class KeepEncoded : UrlDecoder
    {
        this ( cstring source_in )
        {
            super(source_in);
        }

        protected override bool copyDecoded ( cstring decoded, cstring original )
        {
            return false;
        }
    }

    cstring encoded = "%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207%";

    scope decoder = new KeepEncoded(encoded);

    auto passed_through = new char[0];

    foreach (chunk; decoder)
    {
        passed_through ~= chunk;
    }

    test (passed_through == encoded);
}