1 /******************************************************************************
2 
3     UTF-8 URL decoder
4 
5     Uses the glib 2.0, use
6 
7         -Lglib-2.0
8 
9     as linking parameter.
10 
11     Copyright:
12         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
13         All rights reserved.
14 
15     License:
16         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
17         Alternatively, this file may be distributed under the terms of the Tango
18         3-Clause BSD License (see LICENSE_BSD.txt for details).
19 
20  ******************************************************************************/
21 
22 module ocean.net.util.UrlDecoder;
23 
24 import ocean.core.Verify;
25 import ocean.meta.types.Qualifiers;
26 import ocean.text.util.SplitIterator: ChrSplitIterator;
27 
28 import core.stdc.string: memmove;
29 
30 version (unittest) import ocean.core.Test;
31 
/******************************************************************************

    Prototypes of the glib-2.0 C functions used by the decoder. Both are
    private to this module and use C linkage.

 ******************************************************************************/

/**************************************************************************

    Interprets an ASCII character as a hexadecimal digit and yields its
    numeric value.

    @see http://developer.gnome.org/glib/stable/glib-String-Utility-Functions.html#g-ascii-xdigit-value

    Params:
        c = an ASCII character.

    Returns:
        the numeric value of c if it is a hexadecimal digit or -1 if it is
        not.

 **************************************************************************/

private extern (C) int g_ascii_xdigit_value (char c);

/**************************************************************************

    Encodes a single Unicode character as UTF-8.

    @see http://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-unichar-to-utf8

    Params:
        c      = a Unicode character code
        outbuf = output buffer which must provide space for at least 6
                 bytes. If NULL, only the length of the sequence is
                 computed and returned; nothing is written.

    Returns:
        number of bytes written

 **************************************************************************/

private extern (C) int g_unichar_to_utf8 (dchar c, char* outbuf);
69 
70 /******************************************************************************
71 
72     UrlDecoder class
73 
74     Memory friendly, suitable for stack-allocated 'scope' instances.
75 
76  ******************************************************************************/
77 
class UrlDecoder
{
    /**************************************************************************

        Source string, may be changed at any time except during decoding
        'foreach' iteration.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /***************************************************************************

        Decodes this.source in a 'foreach' iteration over decoded chunks.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        "%%" is decoded to a single '%'. A '%' that does not start a valid
        escape sequence is passed through unchanged.

        For each successfully decoded character copyDecoded() is asked whether
        the decoded or the original (encoded, including the leading '%') form
        should be passed to the 'foreach' body.

        Params:
            dg = 'foreach' body delegate; a non-zero return value stops the
                 iteration

        Returns:
            the last return value of dg (0 if the iteration was completed)

    **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        // dg takes its argument by reference. Passing chunks through this
        // wrapper allows handing over rvalues (slices, literals) and makes
        // sure dg only ever sees a local copy of the slice.
        int callDg ( cstring str )
        {
            return dg(str);
        }

        scope iterate_markers = new ChrSplitIterator('%');

        iterate_markers.include_remaining = false;

        size_t first_marker = iterate_markers.reset(this.source).locateDelim();

        if (first_marker < this.source.length)
        {
            // Everything in front of the first '%' needs no decoding.
            int result = callDg(this.source[0 .. first_marker]);

            if (!result) foreach (ref pos, between; iterate_markers.reset(this.source[first_marker .. $]))
            {
                // Plain text between two '%' markers.
                result = dg(between);

                if (result) break;

                auto remaining = iterate_markers.remaining;

                char[6] decoded_buf;
                size_t read_pos = 0;

                auto decoded = decodeCharacter(decoded_buf, remaining, read_pos);

                if (decoded.length)
                {
                    verify(read_pos != 0);

                    // The encoded form is the '%' marker plus the read_pos
                    // characters consumed by decodeCharacter().
                    // NOTE(review): relies on 'remaining' slicing this.source
                    // right behind the '%' marker, which is what
                    // decodeCharacter() expects of its input position.
                    size_t after_marker = cast(size_t) (remaining.ptr - this.source.ptr);

                    verify(after_marker > 0);

                    auto original = this.source[after_marker - 1 .. after_marker + read_pos];

                    result = callDg(this.copyDecoded(decoded, original)?
                                        decoded : original);

                    // Make the iterator skip the consumed characters.
                    pos += read_pos;
                }
                else                                           // decoding error
                {
                    verify(!read_pos);

                    // Pass the '%' through; the characters behind it are
                    // delivered as plain text by the next iteration cycle.
                    result = callDg("%");
                }

                if (result) break;
            }

            return result? result : callDg(iterate_markers.remaining);
        }
        else
        {
            // No '%' in the source: pass it through as a whole. Use callDg()
            // so that dg cannot rebind this.source through its ref parameter.
            return callDg(this.source);
        }
    }

    /***************************************************************************

        Extracts a single character from the specified position in the passed
        string, which is expected to be the index of a character preceded by a
        '%'.
        source[pos .. $] is scanned to see if they represent an encoded
        character in either the RFC 2396 escape format (%XX) or the non-standard
        escape format (%uXXXX) or if they should represent a '%' (%%).

        (See: http://en.wikipedia.org/wiki/Percent-encoding)

        On success the extracted character is written as utf8 into the provided
        output buffer and pos is increased to the index right after the last
        consumed character in source. On failure pos remains unchanged.

        Params:
            dst    = string buffer to receive decoded characters; must not be
                     empty and needs a length of at least 6 to decode the
                     "%uXXXX" format. dst[0] may be overwritten even on
                     failure.
            source = character string to decode a character from; may be
                     empty or null which will result in failure
            pos    = position in source

        Returns:
            a slice to the UTF-8 representation of the decoded character in dst
            on success or an empty string on failure. The returned string is
            guaranteed to slice dst from dst[0].

    ***************************************************************************/

    public static mstring decodeCharacter ( mstring dst, cstring source, ref size_t pos )
    out (slice)
    {
        assert (slice.ptr is dst.ptr, typeof (this).stringof ~ ".decodeCharacter: bad returned slice");
        assert(pos <= source.length, typeof (this).stringof ~ ".decodeCharacter (out): offset out of array bounds");
    }
    do
    {
        verify(
            pos <= source.length,
            typeof (this).stringof ~
                ".decodeCharacter (in): offset out of array bounds"
        );

        auto src = source[pos .. $];

        size_t read    = 0,
               written = 0;

        if (src.length) switch (src[0])
        {
            default:                                    // "%XX"
                if (src.length >= 2)
                {
                    written = hex2(src[0], src[1], dst[0]);

                    if (written)
                    {
                        read = 2;
                    }
                }
                break;

            case 'u':                                   // "%uXXXX"
                if (src.length >= 5)
                {
                    written = hex4(src[1 .. 5], dst).length;

                    if (written)
                    {
                        read = 5;
                    }
                }
                break;

            case '%':                                   // "%%"
                read  = 1;
                written = 1;
                dst[0] = src[0];
                break;
        }

        pos += read;

        return dst[0 .. written];
    }

    /***************************************************************************

        Decodes '%' encoded characters in str, replacing them in-place.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        "%%" is decoded to a single '%'; the text following it is literal, so
        "%%41" results in "%41" (the same as with the 'foreach' iteration). A
        '%' that does not start a valid escape sequence is passed through
        unchanged.

        Note that the original content in str is overwritten with the decoded
        content. The resulting content is at most as long as the original. The
        returned string slices the valid content in str. str itself may contain
        trailing junk.

        Params:
            str = string to decode

        Returns:
            the decoded str content (slices str from the beginning)

        Out:
            The returned array slices str from the beginning.

    ***************************************************************************/

    public static mstring decode ( mstring str )
    out (str_out)
    {
        assert (str_out.ptr is str.ptr);
    }
    do
    {
        // Write position in str; never gets ahead of the read position
        // because no decoded sequence is longer than its encoded form.
        size_t pos = 0;

        if (str.length)
        {
            scope iterator = new ChrSplitIterator('%');

            // Skip the beginning of str before the first '%'.

            foreach (chunk; iterator.reset(str))
            {
                pos = chunk.length;
                break;
            }

            // true if the previous chunk was empty and made a '%' be written,
            // that is, the current chunk directly follows a "%%" sequence.
            bool had_percent = false;

            foreach (chunk; iterator)
            {
                size_t read = 0, written = 0;

                if (chunk.length)
                {
                    if (had_percent)
                    {
                        // chunk follows "%%", which is a complete escape
                        // sequence: chunk is literal text even if it happens
                        // to start with hex digits. Leave read and written 0
                        // so that the chunk is passed through below.
                    }
                    else if (chunk[0] == 'u')
                    {
                        // Have a 'u': Assume four hex digits follow which denote
                        // the character value; decode that character and copy the
                        // UTF-8 sequence into str, starting from pos. Note that
                        // since g_unichar_to_utf8() produces UTF-8 sequence of 6
                        // bytes maximum, the UTF-8 sequence won't be longer than
                        // the original "%u####" sequence.

                        read = 5;
                        if (chunk.length >= read)
                        {
                            written = hex4(chunk[1 .. read], str[pos .. pos + 6]).length;
                        }
                    }
                    else
                    {
                        // Assume two hex digits follow which denote the character
                        // value; replace str[pos] with the corresponding character.

                        read = 2;
                        if (chunk.length >= read)
                        {
                            written = hex2(chunk[0], chunk[1], str[pos]);
                        }
                    }
                }
                else
                {
                    // Empty chunk: two adjacent '%'. The first one of a pair
                    // produces a '%', the second one is consumed silently.

                    if (had_percent)
                    {
                        had_percent = false;
                    }
                    else
                    {
                        str[pos++] = '%';
                        had_percent = true;
                    }

                    continue;
                }

                verify(written <= read);

                // written = 0 => error or literal text: Pass through the
                // sequence, prepending the '%' that was skipped by the
                // iterator unless it was already written for a "%%".

                if (!written)
                {
                    if (!had_percent)
                    {
                        str[pos] = '%';
                        written = 1;
                    }

                    read = 0;
                }

                pos += written;

                // Move the rest of chunk to the front.

                if (chunk.length > read)
                {
                    cstring between = chunk[read .. $];

                    // Source and destination may overlap, hence memmove.
                    memmove(&str[pos], &between[0], between.length);

                    pos += between.length;
                }

                had_percent = false;
            }
        }

        return str[0 .. pos];
    }

    /***************************************************************************

        Creates a character c with the value specified by the 2-digit ASCII
        hexadecimal number whose digits are hi and lo. For example, if
        hi = 'E' or 'e' and lo = '9', c will be 0xE9.

        Params:
            hi = most significant hexadecimal digit (ASCII)
            lo = least significant hexadecimal digit (ASCII)
            c  = output character; as an 'out' parameter it is reset to
                 char.init on failure

        Returns:
            true on success or false if hi or lo or both are not a hexadecimal
            digit.

     ***************************************************************************/

    static bool hex2 ( char hi, char lo, out char c )
    {
        int xhi = g_ascii_xdigit_value(hi),
            xlo = g_ascii_xdigit_value(lo);

        // g_ascii_xdigit_value() reports a non-hex character as -1.
        if (xhi >= 0 && xlo >= 0)
        {
            c = cast(char) ((xhi << 4) | xlo);

            return true;
        }
        else
        {
            return false;
        }
    }

    /***************************************************************************

        Converts hex, which is expected to contain a 4-digit ASCII hexadecimal
        number, into its corresponding UTF-8 character sequence.

        Params:
            hex      = character code in hexadecimal representation (ASCII)
            utf8_buf = destination buffer for the UTF-8 sequence of the
                       character; the length must be at least 6; may contain
                       trailing junk if the sequence is actually shorter

        Returns:
            the UTF-8 sequence (slices the valid data in utf8_buf) on success or
            an empty string on failure.

        In:
            - hex.length must be 4,
            - utf8_buf.length must at least be 6.

        Out:
            The returned string slices utf8_buf from the beginning.

    ***************************************************************************/

    static mstring hex4 ( cstring hex, mstring utf8_buf )
    out (utf8)
    {
        assert (utf8_buf.ptr is utf8.ptr);
    }
    do
    {
        verify (hex.length == 4);
        verify (utf8_buf.length >= 6);

        // All four digits are read before anything is written to utf8_buf so
        // hex and utf8_buf may overlap.
        int hihi = g_ascii_xdigit_value(hex[0]),
            hilo = g_ascii_xdigit_value(hex[1]),
            lohi = g_ascii_xdigit_value(hex[2]),
            lolo = g_ascii_xdigit_value(hex[3]);

        size_t n = 0;

        if (hihi >= 0 && hilo >= 0 && lohi >= 0 && lolo >= 0)
        {
            dchar c = ((cast (dchar) hihi) << 0xC) |
                      ((cast (dchar) hilo) << 0x8) |
                      ((cast (dchar) lohi) << 0x4) |
                      ((cast (dchar) lolo));

            n = cast (size_t) g_unichar_to_utf8(c, utf8_buf.ptr);
        }

        return utf8_buf[0 .. n];
    }

    /**************************************************************************

        To be overridden as an option, called by opApply().

        Determines whether each decoded character should be passed as 'foreach'
        iteration variable string in its decoded or its original (encoded) form.
        This can be used in cases where the decoding of only certain characters
        is desired.

        By default always the decoded form is selected.

        Params:
            decoded  = decoded form of the character
            original = original (encoded) form, including the leading '%'

        Returns:
            true to use the decoded or false to use the original (encoded) form.

     **************************************************************************/

    protected bool copyDecoded ( cstring decoded, cstring original )
    {
        return true;
    }
}
511 
512 
unittest
{
    // Chunk-wise decoding through opApply(): collect all chunks and compare
    // the concatenation. Covers a '%' without a valid escape sequence behind
    // it, an invalid "%u" sequence, "%uXXXX" escapes, "%%" and a trailing '%'.
    scope decoder = new UrlDecoder("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207%");
    scope decoded = new char[0];

    foreach (chunk; decoder)
    {
        decoded ~= chunk;
    }

    test (decoded == "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇%");

    // In-place decoding of the same text without the trailing '%'.
    auto in_place = "%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207".dup;

    test (UrlDecoder.decode(in_place) ==
                   "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇");
}