/******************************************************************************

    UTF-8 URL decoder

    Relies on glib 2.0 for hexadecimal digit parsing and UTF-8 encoding; link
    against it with

        -lglib-2.0

    (when building with dmd, hand this to the linker as -L-lglib-2.0).

    Copyright:
        Copyright (c) 2009-2016 dunnhumby Germany GmbH.
        All rights reserved.

    License:
        Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
        Alternatively, this file may be distributed under the terms of the Tango
        3-Clause BSD License (see LICENSE_BSD.txt for details).

******************************************************************************/

module ocean.net.util.UrlDecoder;

import ocean.core.Verify;
import ocean.meta.types.Qualifiers;
import ocean.text.util.SplitIterator: ChrSplitIterator;

import core.stdc.string: memmove;

version (unittest) import ocean.core.Test;

extern (C) private
{
    /**************************************************************************

        glib: determines the numeric value of a character interpreted as a
        hexadecimal digit.

        @see http://developer.gnome.org/glib/stable/glib-String-Utility-Functions.html#g-ascii-xdigit-value

        Params:
            c = an ASCII character.

        Returns:
            the numeric value of c if it is a hex digit or -1 otherwise.

     **************************************************************************/

    int g_ascii_xdigit_value (char c);

    /**************************************************************************

        glib: converts a single Unicode character to its UTF-8 sequence.

        @see http://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-unichar-to-utf8

        Params:
            c      = a Unicode character code
            outbuf = output buffer, must provide at least 6 bytes of space.
                     If NULL, only the length is computed and returned and
                     nothing is written to outbuf.

        Returns:
            number of bytes written

     **************************************************************************/

    int g_unichar_to_utf8(dchar c, char* outbuf);
}

/******************************************************************************

    UrlDecoder class

    Memory friendly, suitable for stack-allocated 'scope' instances.

 ******************************************************************************/

class UrlDecoder
{
    /**************************************************************************

        The string to decode. It may be reassigned at any time except while a
        decoding 'foreach' iteration over this instance is in progress.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = initial source string (may also be set later through
                        the source member)

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /***************************************************************************

        Decodes this.source in a 'foreach' iteration over decoded chunks.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.
113 114 **************************************************************************/ 115 116 public int opApply ( scope int delegate ( ref cstring chunk ) dg ) 117 { 118 int callDg ( cstring str ) 119 { 120 return dg(str); 121 } 122 123 scope iterate_markers = new ChrSplitIterator('%'); 124 125 iterate_markers.include_remaining = false; 126 127 size_t first_marker = iterate_markers.reset(this.source).locateDelim(); 128 129 if (first_marker < this.source.length) 130 { 131 int result = callDg(this.source[0 .. first_marker]); 132 133 if (!result) foreach (ref pos, between; iterate_markers.reset(this.source[first_marker .. $])) 134 { 135 result = dg(between); 136 137 if (result) break; 138 139 auto remaining = iterate_markers.remaining; 140 141 char[6] decoded_buf; 142 size_t read_pos = 0; 143 144 auto decoded = decodeCharacter(decoded_buf, remaining, read_pos); 145 146 if (decoded.length) 147 { 148 verify(read_pos != 0); 149 150 auto original = this.source[0 .. read_pos]; 151 152 result = callDg(this.copyDecoded(decoded, original)? 153 decoded : original); 154 155 pos += read_pos; 156 } 157 else // decoding error 158 { 159 verify(!read_pos); 160 161 result = callDg("%"); 162 } 163 164 if (result) break; 165 } 166 167 return result? result : callDg(iterate_markers.remaining); 168 } 169 else 170 { 171 return dg(this.source); 172 } 173 } 174 175 /*************************************************************************** 176 177 Extracts a single character from the specified position in the passed 178 string, which is expected to be the index of a character preceded by a 179 '%'. 180 source[pos .. $] is scanned to see if they represent an encoded 181 character in either the RFC 2396 escape format (%XX) or the non-standard 182 escape format (%uXXXX) or if they should represent a '%' (%%). 
183 184 (See: http://en.wikipedia.org/wiki/Percent-encoding) 185 186 On success the extracted character is written as utf8 into the provided 187 output buffer and pos is increased to the index right after the last 188 consumed character in source. On failure pos remains unchanged. 189 190 Params: 191 dst = string buffer to receive decoded characters 192 source = character string to decode a character from; may be 193 empty or null which will result in failure 194 pos = position in source 195 196 Returns: 197 a slice to the UTF-8 representation of the decoded character in dst 198 on success or an empty string on failure. The returned string is 199 guaranteed to slice dst from dst[0]. 200 201 ***************************************************************************/ 202 203 public static mstring decodeCharacter ( mstring dst, cstring source, ref size_t pos ) 204 out (slice) 205 { 206 assert (slice.ptr is dst.ptr, typeof (this).stringof ~ ".decodeCharacter: bad returned slice"); 207 assert(pos <= source.length, typeof (this).stringof ~ ".decodeCharacter (out): offset out of array bounds"); 208 } 209 do 210 { 211 verify( 212 pos <= source.length, 213 typeof (this).stringof ~ 214 ".decodeCharacter (in): offset out of array bounds" 215 ); 216 217 auto src = source[pos .. $]; 218 219 size_t read = 0, 220 written = 0; 221 222 if (src.length) switch (src[0]) 223 { 224 default: 225 if (src.length >= 2) 226 { 227 written = hex2(src[0], src[1], dst[0]); 228 229 if (written) 230 { 231 read = 2; 232 } 233 } 234 break; 235 236 case 'u': 237 if (src.length >= 5) 238 { 239 written = hex4(src[1 .. 5], dst).length; 240 241 if (written) 242 { 243 read = 5; 244 } 245 } 246 break; 247 248 case '%': 249 read = 1; 250 written = 1; 251 dst[0] = src[0]; 252 } 253 254 pos += read; 255 256 return dst[0 .. written]; 257 } 258 259 /*************************************************************************** 260 261 Decodes '%' encoded characters in str, replacing them in-place. 
262 263 Checks whether the passed source string contains any characters encoded 264 according to the RFC 2396 escape format. (A '%' character followed by 265 two hexadecimal digits.) 266 267 The non-standard 4-digit unicode encoding scheme is also supported ("%u" 268 followed by four hex digits). Such characters are converted to UTF-8. 269 270 Note that the original content in str is overwritten with the decoded 271 content. The resulting content is at most as long as the original. The 272 returned string slices the valid content in str. str itself may contain 273 tailing junk. 274 275 Params: 276 str = string to decode 277 278 Returns: 279 the decoded str content (slices str from the beginning) 280 281 Out: 282 The returned array slices str from the beginning. 283 284 ***************************************************************************/ 285 286 public static mstring decode ( mstring str ) 287 out (str_out) 288 { 289 assert (str_out.ptr is str.ptr); 290 } 291 do 292 { 293 size_t pos = 0; 294 295 if (str.length) 296 { 297 scope iterator = new ChrSplitIterator('%'); 298 299 // Skip the beginning of str before the first '%'. 300 301 foreach (chunk; iterator.reset(str)) 302 { 303 pos = chunk.length; 304 break; 305 } 306 307 bool had_percent = false; 308 309 foreach (chunk; iterator) 310 { 311 size_t read, written = 0; 312 313 if (chunk.length) 314 { 315 if (chunk[0] == 'u') 316 { 317 // Have a 'u': Assume four hex digits follow which denote 318 // the character value; decode that character and copy the 319 // UTF-8 sequence into str, starting from pos. Note that 320 // since g_unichar_to_utf8() produces UTF-8 sequence of 6 321 // bytes maximum, the UTF-8 sequence won't be longer than 322 // the original "%u####" sequence. 323 324 read = 5; 325 if (chunk.length >= read) 326 { 327 written = hex4(chunk[1 .. read], str[pos .. 
pos + 6]).length; 328 } 329 } 330 else 331 { 332 // Assume two hex digits follow which denote the character 333 // value; replace str[pos] with the corresponding character. 334 335 read = 2; 336 if (chunk.length >= read) 337 { 338 written = hex2(chunk[0], chunk[1], str[pos]); 339 } 340 } 341 } 342 else 343 { 344 if (had_percent) 345 { 346 had_percent = false; 347 } 348 else 349 { 350 str[pos++] = '%'; 351 had_percent = true; 352 } 353 354 continue; 355 } 356 357 verify(written <= read); 358 359 // written = 0 => error: Pass through the erroneous sequence, 360 // prepending the '%' that was skipped by the iterator. 361 362 if (!written) 363 { 364 if (had_percent) 365 { 366 had_percent = false; 367 } 368 else 369 { 370 str[pos] = '%'; 371 written = 1; 372 had_percent = true; 373 } 374 375 read = 0; 376 } 377 378 pos += written; 379 380 // Move the rest of chunk to the front. 381 382 if (chunk.length > read) 383 { 384 cstring between = chunk[read .. $]; 385 386 memmove(&str[pos], &between[0], between.length); 387 388 pos += between.length; 389 } 390 391 had_percent = false; 392 } 393 } 394 395 return str[0 .. pos]; 396 } 397 398 /*************************************************************************** 399 400 Creates a character c with the value specified by the 2-digit ASCII 401 hexadecimal number whose digits are hi and lo. For example, if 402 hi = 'E' or 'e' and lo = '9', c will be 0xE9. 403 404 Params: 405 hi = most significant hexadecimal digit (ASCII) 406 lo = least significant hexadecimal digit (ASCII) 407 c = output character 408 409 Returns: 410 true on success or false if hi or lo or both are not a hexadecimal 411 digit. 
412 413 ***************************************************************************/ 414 415 static bool hex2 ( char hi, char lo, out char c ) 416 { 417 int xhi = g_ascii_xdigit_value(hi), 418 xlo = g_ascii_xdigit_value(lo); 419 420 if (xhi >= 0 && xlo >= 0) 421 { 422 c = cast(char) ((xhi << 4) | xlo); 423 424 return true; 425 } 426 else 427 { 428 return false; 429 } 430 } 431 432 /*************************************************************************** 433 434 Converts hex, which is expected to contain a 4-digit ASCII hexadecimal 435 number, into its corresponding UTF-8 character sequence. 436 437 Params: 438 hex = character code in hexadecimal representation (ASCII) 439 utf8_buf = destination buffer for the UTF-8 sequence of the 440 character; the length must be at least 6; may contain 441 tailing junk if the sequence is actually shorter 442 443 Returns: 444 the UTF-8 sequence (slices the valid data in utf8_buf) on success or 445 an empty string on failure. 446 447 In: 448 - hex.length must be 4, 449 - utf8_buf.length must at least be 6. 450 451 Out: 452 The returned string slices utf8_buf from the beginning. 453 454 ***************************************************************************/ 455 456 static mstring hex4 ( cstring hex, mstring utf8_buf ) 457 out (utf8) 458 { 459 assert (utf8_buf.ptr is utf8.ptr); 460 } 461 do 462 { 463 verify (hex.length == 4); 464 verify (utf8_buf.length >= 6); 465 466 int hihi = g_ascii_xdigit_value(hex[0]), 467 hilo = g_ascii_xdigit_value(hex[1]), 468 lohi = g_ascii_xdigit_value(hex[2]), 469 lolo = g_ascii_xdigit_value(hex[3]); 470 471 size_t n = 0; 472 473 if (hihi >= 0 && hilo >= 0 && lohi >= 0 && lolo >= 0) 474 { 475 dchar c = ((cast (dchar) hihi) << 0xC) | 476 ((cast (dchar) hilo) << 0x8) | 477 ((cast (dchar) lohi) << 0x4) | 478 ((cast (dchar) lolo)); 479 480 n = cast (size_t) g_unichar_to_utf8(c, utf8_buf.ptr); 481 } 482 483 return utf8_buf[0 .. 
n]; 484 } 485 486 /************************************************************************** 487 488 To be overridden as an option, called by opApply(). 489 490 Determines whether each decoded character should be passed as 'foreach' 491 iteration variable string in its decoded or its original (encoded) form. 492 This can be used in cases where the decoding of only certain characters 493 is desired. 494 495 By default always the decoded form is selected. 496 497 Params: 498 decoded = decoded form of the character 499 original = original (encoded) form 500 501 Returns: 502 true to use the decoded or false to use the original (encoded) form. 503 504 **************************************************************************/ 505 506 protected bool copyDecoded ( cstring decoded, cstring original ) 507 { 508 return true; 509 } 510 } 511 512 513 unittest 514 { 515 scope decoder = new UrlDecoder("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207%"), 516 decoded = new char[0]; 517 518 foreach (chunk; decoder) 519 { 520 decoded ~= chunk; 521 } 522 523 test (decoded == "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇%"); 524 525 test (UrlDecoder.decode("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207".dup) == 526 "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇"); 527 }