1 /******************************************************************************
2 
3     UTF-8 URL decoder
4 
5     Uses the glib 2.0, use
6 
        -L-lglib-2.0
8 
9     as linking parameter.
10 
11     Copyright:
12         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
13         All rights reserved.
14 
15     License:
16         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
17         Alternatively, this file may be distributed under the terms of the Tango
18         3-Clause BSD License (see LICENSE_BSD.txt for details).
19 
20  ******************************************************************************/
21 
22 module ocean.net.util.UrlDecoder;
23 
24 
25 import ocean.transition;
26 
27 import ocean.core.Verify;
28 
29 import ocean.text.util.SplitIterator: ChrSplitIterator;
30 
31 import ocean.stdc.string: memmove;
32 
33 version(UnitTest) import ocean.core.Test;
34 
/**************************************************************************

    glib binding: translates one ASCII character into the value it has as a
    hexadecimal digit.

    @see http://developer.gnome.org/glib/stable/glib-String-Utility-Functions.html#g-ascii-xdigit-value

    Params:
        c = ASCII character to interpret as a hexadecimal digit

    Returns:
        the numeric value of c if it is a hexadecimal digit or -1 if it is
        not.

 **************************************************************************/

private extern (C) int g_ascii_xdigit_value ( char c );

/**************************************************************************

    glib binding: encodes one Unicode character as a UTF-8 sequence.

    @see http://developer.gnome.org/glib/stable/glib-Unicode-Manipulation.html#g-unichar-to-utf8

    Params:
        c      = Unicode code of the character to encode
        outbuf = destination for the UTF-8 sequence; at least 6 bytes have
                 to be available. Passing NULL only computes the sequence
                 length, nothing is written in that case.

    Returns:
        the number of bytes written

 **************************************************************************/

private extern (C) int g_unichar_to_utf8 ( dchar c, char* outbuf );
72 
73 /******************************************************************************
74 
75     UrlDecoder class
76 
77     Memory friendly, suitable for stack-allocated 'scope' instances.
78 
79  ******************************************************************************/
80 
class UrlDecoder
{
    /**************************************************************************

        Source string, may be changed at any time except during decoding
        'foreach' iteration.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /***************************************************************************

        Decodes this.source in a 'foreach' iteration over decoded chunks.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        The chunks passed to the delegate are, in order of appearance:
            - the text between two '%' markers (slices this.source),
            - for each successfully decoded sequence either the decoded
              character or the complete original sequence including the
              leading '%', as selected by copyDecoded(),
            - a single "%" for each '%' that does not start a valid sequence.

        Note that a decoded chunk references a buffer that is only valid
        during the delegate call.

        Params:
            dg = 'foreach' iteration delegate

        Returns:
            the value returned by the last call of dg; a non-zero value stops
            the iteration.

    **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        // dg takes its argument by reference so rvalues need this wrapper.
        int callDg ( cstring str )
        {
            return dg(str);
        }

        scope iterate_markers = new ChrSplitIterator('%');

        // The tail behind the last '%' is passed to dg explicitly at the end.
        iterate_markers.include_remaining = false;

        size_t first_marker = iterate_markers.reset(this.source).locateDelim();

        if (first_marker < this.source.length)
        {
            int result = callDg(this.source[0 .. first_marker]);

            if (!result) foreach (ref pos, between; iterate_markers.reset(this.source[first_marker .. $]))
            {
                result = dg(between);

                if (result) break;

                // The text right behind the current '%' up to the end of
                // this.source.
                auto remaining = iterate_markers.remaining;

                char[6] decoded_buf;
                size_t read_pos = 0;

                auto decoded = decodeCharacter(decoded_buf, remaining, read_pos);

                if (decoded.length)
                {
                    verify(read_pos != 0);

                    // The encoded sequence consists of the '%' consumed by
                    // the iterator plus the read_pos characters consumed by
                    // decodeCharacter(). remaining is a tail of this.source
                    // so the '%' is located right in front of it.
                    size_t encoded_start = this.source.length - remaining.length - 1;

                    auto original = this.source[encoded_start .. encoded_start + 1 + read_pos];

                    result = callDg(this.copyDecoded(decoded, original)?
                                        decoded : original);

                    // Make the iterator skip the consumed characters.
                    pos += read_pos;
                }
                else                                           // decoding error
                {
                    verify(!read_pos);

                    // Pass the '%' through; the text behind it is delivered
                    // as the next "between" chunk or as the remaining tail.
                    result = callDg("%");
                }

                if (result) break;
            }

            return result? result : callDg(iterate_markers.remaining);
        }
        else
        {
            // No '%' in the source string: nothing to decode.
            return dg(this.source);
        }
    }

    /***************************************************************************

        Extracts a single character from the specified position in the passed
        string, which is expected to be the index of a character preceded by a
        '%'.
        source[pos .. $] is scanned to see if they represent an encoded
        character in either the RFC 2396 escape format (%XX) or the non-standard
        escape format (%uXXXX) or if they should represent a '%' (%%).

        (See: http://en.wikipedia.org/wiki/Percent-encoding)

        On success the extracted character is written as utf8 into the provided
        output buffer and pos is increased to the index right after the last
        consumed character in source. On failure pos remains unchanged.

        Params:
            dst    = string buffer to receive decoded characters; the "%u"
                     format requires a length of at least 6
            source = character string to decode a character from; may be
                     empty or null which will result in failure
            pos    = position in source

        Returns:
            a slice to the UTF-8 representation of the decoded character in dst
            on success or an empty string on failure. The returned string is
            guaranteed to slice dst from dst[0].

    ***************************************************************************/

    public static mstring decodeCharacter ( mstring dst, cstring source, ref size_t pos )
    out (slice)
    {
        assert (slice.ptr is dst.ptr, typeof (this).stringof ~ ".decodeCharacter: bad returned slice");
        assert(pos <= source.length, typeof (this).stringof ~ ".decodeCharacter (out): offset out of array bounds");
    }
    body
    {
        verify(
            pos <= source.length,
            typeof (this).stringof ~
                ".decodeCharacter (in): offset out of array bounds"
        );

        auto src = source[pos .. $];

        size_t read    = 0,
               written = 0;

        if (src.length) switch (src[0])
        {
            default:                                    // "%XX"
                if (src.length >= 2)
                {
                    written = hex2(src[0], src[1], dst[0]);

                    if (written)
                    {
                        read = 2;
                    }
                }
                break;

            case 'u':                                   // "%uXXXX"
                if (src.length >= 5)
                {
                    written = hex4(src[1 .. 5], dst).length;

                    if (written)
                    {
                        read = 5;
                    }
                }
                break;

            case '%':                                   // "%%"
                read  = 1;
                written = 1;
                dst[0] = src[0];
        }

        pos += read;

        return dst[0 .. written];
    }

    /***************************************************************************

        Decodes '%' encoded characters in str, replacing them in-place.

        Checks whether the passed source string contains any characters encoded
        according to the RFC 2396 escape format. (A '%' character followed by
        two hexadecimal digits.)

        The non-standard 4-digit unicode encoding scheme is also supported ("%u"
        followed by four hex digits). Such characters are converted to UTF-8.

        "%%" is decoded to a single '%'; the text following it is copied
        literally so that, for example, "%%41" results in "%41".

        A '%' that does not start a valid sequence is passed through
        unchanged.

        Note that the original content in str is overwritten with the decoded
        content. The resulting content is at most as long as the original. The
        returned string slices the valid content in str. str itself may contain
        trailing junk.

        Params:
            str = string to decode

        Returns:
            the decoded str content (slices str from the beginning)

        Out:
            The returned array slices str from the beginning.

    ***************************************************************************/

    public static mstring decode ( mstring str )
    out (str_out)
    {
        assert (str_out.ptr is str.ptr);
    }
    body
    {
        // Write position in str; never exceeds the read position because
        // the decoded content is never longer than the encoded content.
        size_t pos = 0;

        if (str.length)
        {
            scope iterator = new ChrSplitIterator('%');

            // Skip the beginning of str before the first '%'.

            foreach (chunk; iterator.reset(str))
            {
                pos = chunk.length;
                break;
            }

            // true if the previous '%' was the first one of a "%%" sequence
            // so that the current chunk follows the second '%' of it.
            bool had_percent = false;

            foreach (chunk; iterator)
            {
                if (!chunk.length)
                {
                    // Two adjacent '%': The first one of a pair produces a
                    // '%', the second one is swallowed.

                    if (had_percent)
                    {
                        had_percent = false;
                    }
                    else
                    {
                        str[pos++] = '%';
                        had_percent = true;
                    }

                    continue;
                }

                size_t read    = 0,
                       written = 0;

                // A chunk that follows "%%" is literal text: the '%' in front
                // of it has already been used up by the "%%" escape so it
                // must not be decoded.

                if (!had_percent)
                {
                    if (chunk[0] == 'u')
                    {
                        // Have a 'u': Assume four hex digits follow which denote
                        // the character value; decode that character and copy the
                        // UTF-8 sequence into str, starting from pos. Note that
                        // since g_unichar_to_utf8() produces UTF-8 sequence of 6
                        // bytes maximum, the UTF-8 sequence won't be longer than
                        // the original "%u####" sequence.

                        if (chunk.length >= 5)
                        {
                            written = hex4(chunk[1 .. 5], str[pos .. pos + 6]).length;

                            if (written)
                            {
                                read = 5;
                            }
                        }
                    }
                    else if (chunk.length >= 2)
                    {
                        // Assume two hex digits follow which denote the character
                        // value; replace str[pos] with the corresponding character.

                        if (hex2(chunk[0], chunk[1], str[pos]))
                        {
                            written = 1;
                            read    = 2;
                        }
                    }

                    // written = 0 => error: Pass through the erroneous sequence,
                    // prepending the '%' that was skipped by the iterator.

                    if (!written)
                    {
                        str[pos] = '%';
                        written  = 1;
                    }
                }

                had_percent = false;

                verify(read <= chunk.length);

                pos += written;

                // Move the rest of chunk to the front.

                if (chunk.length > read)
                {
                    cstring between = chunk[read .. $];

                    memmove(&str[pos], &between[0], between.length);

                    pos += between.length;
                }
            }
        }

        return str[0 .. pos];
    }

    /***************************************************************************

        Creates a character c with the value specified by the 2-digit ASCII
        hexadecimal number whose digits are hi and lo. For example, if
        hi = 'E' or 'e' and lo = '9', c will be 0xE9.

        Note that c is an 'out' parameter so it is reset to its initial value
        if this method fails.

        Params:
            hi = most significant hexadecimal digit (ASCII)
            lo = least significant hexadecimal digit (ASCII)
            c  = output character

        Returns:
            true on success or false if hi or lo or both are not a hexadecimal
            digit.

     ***************************************************************************/

    static bool hex2 ( char hi, char lo, out char c )
    {
        int xhi = g_ascii_xdigit_value(hi),
            xlo = g_ascii_xdigit_value(lo);

        if (xhi >= 0 && xlo >= 0)
        {
            c = cast(char) ((xhi << 4) | xlo);

            return true;
        }
        else
        {
            return false;
        }
    }

    /***************************************************************************

        Converts hex, which is expected to contain a 4-digit ASCII hexadecimal
        number, into its corresponding UTF-8 character sequence.

        Params:
            hex      = character code in hexadecimal representation (ASCII)
            utf8_buf = destination buffer for the UTF-8 sequence of the
                       character; the length must be at least 6; may contain
                       trailing junk if the sequence is actually shorter

        Returns:
            the UTF-8 sequence (slices the valid data in utf8_buf) on success or
            an empty string on failure.

        In:
            - hex.length must be 4,
            - utf8_buf.length must at least be 6.

        Out:
            The returned string slices utf8_buf from the beginning.

    ***************************************************************************/

    static mstring hex4 ( cstring hex, mstring utf8_buf )
    out (utf8)
    {
        assert (utf8_buf.ptr is utf8.ptr);
    }
    body
    {
        verify (hex.length == 4);
        verify (utf8_buf.length >= 6);

        int hihi = g_ascii_xdigit_value(hex[0]),
            hilo = g_ascii_xdigit_value(hex[1]),
            lohi = g_ascii_xdigit_value(hex[2]),
            lolo = g_ascii_xdigit_value(hex[3]);

        size_t n = 0;

        // Any non-hex digit makes g_ascii_xdigit_value() return -1.
        if (hihi >= 0 && hilo >= 0 && lohi >= 0 && lolo >= 0)
        {
            // Combine the four nibbles, most significant first.
            dchar c = ((cast (dchar) hihi) << 0xC) |
                      ((cast (dchar) hilo) << 0x8) |
                      ((cast (dchar) lohi) << 0x4) |
                      ((cast (dchar) lolo));

            n = cast (size_t) g_unichar_to_utf8(c, utf8_buf.ptr);
        }

        return utf8_buf[0 .. n];
    }

    /**************************************************************************

        To be overridden as an option, called by opApply().

        Determines whether each decoded character should be passed as 'foreach'
        iteration variable string in its decoded or its original (encoded) form.
        This can be used in cases where the decoding of only certain characters
        is desired.

        By default always the decoded form is selected.

        Params:
            decoded  = decoded form of the character
            original = original (encoded) form, including the leading '%'

        Returns:
            true to use the decoded or false to use the original (encoded) form.

     **************************************************************************/

    protected bool copyDecoded ( cstring decoded, cstring original )
    {
        return true;
    }
}
514 
515 
// Checks both decoding paths, the chunk-wise 'foreach' iteration and the
// in-place decode(), with input that mixes valid "%uXXXX" sequences, a "%%"
// escape and '%' characters that do not start a valid sequence.
unittest
{
    // Chunk-wise decoding: concatenate all chunks and compare the result.
    scope decoder = new UrlDecoder("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207%");
    scope collected = new char[0];

    foreach (chunk; decoder)
    {
        collected ~= chunk;
    }

    test (collected == "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇%");

    // In-place decoding needs a mutable copy of the input.
    auto in_place = UrlDecoder.decode("%Die %uKatze %u221E%u221E tritt die Treppe %% krumm. %u2207".dup);

    test (in_place == "%Die %uKatze ∞∞ tritt die Treppe % krumm. ∇");
}