1 /*******************************************************************************
2 
3     Functions to convert non-ASCII and characters reserved in URLs to percent
4     encoded form.
5 
6     Copyright:
7         Copyright (c) 2009-2016 dunnhumby Germany GmbH.
8         All rights reserved.
9 
10     License:
11         Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
12         Alternatively, this file may be distributed under the terms of the Tango
13         3-Clause BSD License (see LICENSE_BSD.txt for details).
14 
15 *******************************************************************************/
16 
17 module ocean.net.util.UrlEncoder;
18 
19 
20 import ocean.meta.types.Qualifiers;
21 import ocean.core.Verify;
22 import core.stdc.ctype: isgraph;
23 
24 version (unittest) import ocean.core.Test;
25 
26 /******************************************************************************
27 
    Percent-encodes every character that is not unreserved. The unreserved
    characters are the ASCII alphanumeric characters and the four symbols

        -._~

    (hyphen, period, underscore and tilde).
34 
35     @see http://tools.ietf.org/html/rfc3986#section-2.3
36 
37     Special cases:
38 
39     - The whitespace character 0x20 is encoded as "%20" (not "+").
40     - Characters below 0x20 and above 0x7E are encoded straight away, regardless
41       of any encoding or codepage. For example, the UTF-8 encoded string
42       "Münzstraße", which corresponds to the byte sequence
43       [0x4D, 0xC3, 0xBC, 0x6E, 0x7A, 0x73, 0x74, 0x72, 0x61, 0xC3, 0x9F, 0x65]
44        ...M  .........ü  ....n ...z  ...s  ...t  ...r  ...a  .........ß  ...e
45       , is encoded as "M%C3%BCnzstra%C3%9Fe".
46 
47     UrlEncoder class
48 
49     Memory friendly, suitable for stack-allocated 'scope' instances.
50 
51  ******************************************************************************/
52 
class EncodeNonUnreserved : PercentEncoder
{
    /**************************************************************************

        Lookup table indexed by character value. An entry is true if and only
        if the character is unreserved, that is, an ASCII letter, an ASCII
        digit or one of '-', '_', '.' and '~'.

     **************************************************************************/

    static immutable bool[char.max + 1] unreserved = buildUnreservedMap();

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Tells whether c should be converted: every character that is not
        flagged in the unreserved table is converted.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected override bool encode ( char c )
    {
        immutable is_unreserved = unreserved[c];

        return !is_unreserved;
    }

    /**************************************************************************

        Builds the unreserved character table; evaluated at compile time to
        initialise the immutable table above.

        Returns:
            a table with exactly the entries of the unreserved characters set
            to true.

     **************************************************************************/

    private static bool[char.max + 1] buildUnreservedMap ( )
    {
        // All entries start out as false (the default value of bool).
        bool[char.max + 1] map;

        foreach (c; "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                  ~ "abcdefghijklmnopqrstuvwxyz"
                  ~ "0123456789"
                  ~ "-_.~")
        {
            map[c] = true;
        }

        return map;
    }
}
109 
110 /******************************************************************************
111 
112     Encodes all characters except the ASCII graphic, that is, encodes ASCII
113     whitespace and control characters and non-ASCII characters.
114 
115  ******************************************************************************/
116 
class EncodeExceptAsciiGraph : PercentEncoder
{
    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Tells whether c should be converted. Only the ASCII graphic
        characters, '!' (0x21) to '~' (0x7E), are left unconverted.

        The range is checked explicitly instead of calling the C isgraph()
        function because isgraph() depends on the current C locale and may
        classify bytes above 0x7F as graphic under an 8-bit locale, which
        would let non-ASCII bytes through unencoded.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected override bool encode ( char c )
    {
        return c < '!' || c > '~';
    }
}
150 
151 /******************************************************************************
152 
153     Abstract encoder
154 
155  ******************************************************************************/
156 
class PercentEncoder
{
    /**************************************************************************

        String to encode. It may be reassigned at any time except while an
        encoding 'foreach' iteration is in progress.

     **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = source string

     **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /**************************************************************************

        Encodes this.source in a 'foreach' iteration over encoded chunks.
        Each chunk is guaranteed not to be empty.

        A chunk is either a slice of this.source containing a run of
        characters that need no conversion, or the three character sequence
        "%XX" where XX is the upper case hexadecimal value of one converted
        character. The "%XX" chunk refers to a local buffer which is
        overwritten for the next converted character, so the delegate needs
        to copy it if it wants to keep it.

        Params:
            dg = 'foreach' body delegate; iteration stops as soon as it
                 returns a non-zero value

        Returns:
            the non-zero value returned by dg if it stopped the iteration or
            0 otherwise.

     **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        static immutable hex_digits = "0123456789ABCDEF";

        // Output buffer for one percent encoded character.
        char[3] escape;
        escape[0] = '%';

        // Index of the first character that has not been passed to dg yet.
        size_t pending = 0;

        foreach (pos, ch; this.source)
        {
            if (!this.encode(ch))
            {
                continue;
            }

            verify(pending <= pos);

            // Flush the unconverted characters preceding this one, if any.
            if (pending < pos)
            {
                cstring literal = this.source[pending .. pos];

                if (auto stop = dg(literal)) return stop;
            }

            escape[1] = hex_digits[(ch >> 4) & 0xF];
            escape[2] = hex_digits[ch & 0xF];

            cstring encoded = escape[];

            if (auto stop = dg(encoded)) return stop;

            pending = pos + 1;
        }

        verify(pending <= this.source.length);

        // Flush the trailing run of unconverted characters, if any.
        if (pending < this.source.length)
        {
            cstring tail = this.source[pending .. $];

            return dg(tail);
        }

        return 0;
    }

    /**************************************************************************

        Tells whether c should be converted to its percent encoded form.
        Implemented by the subclass to define which characters are encoded.

        Params:
            c = character in question

        Returns:
            true if c should be converted or false otherwise.

     **************************************************************************/

    protected abstract bool encode ( char c );
}
240 
241 
// Checks the unreserved character table of EncodeNonUnreserved and the chunk
// sequence it produces for a sample text containing reserved characters,
// whitespace, '%' and a non-ASCII (UTF-8 encoded) character.
unittest
{
    // Verifies that all characters from first to last, both inclusive, are
    // flagged as unreserved.
    static void checkRange ( char first, char last )
    {
        for (char c = first; c <= last; c++)
        {
            test (EncodeNonUnreserved.unreserved[c],
                    "'" ~ c ~ "' is supposed to be unreserved");
        }
    }

    checkRange('A', 'Z');
    checkRange('a', 'z');
    checkRange('0', '9');

    foreach (c; "-_.~")
    {
        test (EncodeNonUnreserved.unreserved[c],
                "'" ~ c ~ "' is supposed to be unreserved");
    }

    scope encoder = new EncodeNonUnreserved("For example, the octet "
    ~ "corresponding to the tilde (\"~\") character is often encoded as "
    ~ "\"%7E\" by older URI processing implementations; the \"%7E\" can be "
    ~ "replaced by \"~\" without chänging its interpretation.");

    // Expected chunks, in iteration order.
    static immutable istring[] chunks =
    [
        "For", "%20", "example", "%2C", "%20", "the", "%20", "octet", "%20",
        "corresponding","%20", "to", "%20", "the", "%20", "tilde", "%20",
        "%28", "%22", "~", "%22", "%29", "%20", "character", "%20", "is",
        "%20", "often", "%20", "encoded", "%20", "as", "%20", "%22", "%25",
        "7E", "%22", "%20", "by", "%20", "older", "%20", "URI", "%20",
        "processing", "%20", "implementations", "%3B", "%20", "the", "%20",
        "%22", "%25", "7E", "%22", "%20", "can", "%20", "be", "%20",
        "replaced", "%20", "by", "%20", "%22", "~", "%22", "%20", "without",
        "%20", "ch", "%C3", "%A4", "nging", "%20", "its", "%20",
        "interpretation."
    ];

    size_t i = 0;

    foreach (chunk; encoder)
    {
        test (i < chunks.length);
        test (chunks[i++] == chunk);
    }

    // Without this check a truncated output (including no output at all)
    // would pass unnoticed.
    test (i == chunks.length,
            "encoder produced fewer chunks than expected");
}