1 /*******************************************************************************
2
3 Functions to convert non-ASCII and characters reserved in URLs to percent
4 encoded form.
5
6 Copyright:
7 Copyright (c) 2009-2016 dunnhumby Germany GmbH.
8 All rights reserved.
9
10 License:
11 Boost Software License Version 1.0. See LICENSE_BOOST.txt for details.
12 Alternatively, this file may be distributed under the terms of the Tango
13 3-Clause BSD License (see LICENSE_BSD.txt for details).
14
15 *******************************************************************************/
16
17 module ocean.net.util.UrlEncoder;
18
19
20 import ocean.meta.types.Qualifiers;
21 import ocean.core.Verify;
22 import core.stdc.ctype: isgraph;
23
24 version (unittest) import ocean.core.Test;
25
/******************************************************************************

    Percent-encodes every character that is not "unreserved". The unreserved
    characters are the ASCII letters, the ASCII digits and

        - . _ ~

    @see http://tools.ietf.org/html/rfc3986#section-2.3

    Special cases:

        - The space character 0x20 is encoded as "%20" (not "+").
        - Bytes below 0x20 and above 0x7E are encoded one by one, regardless
          of any character encoding or code page. For example, the UTF-8
          encoded string "Münzstraße", which is the byte sequence

          [0x4D, 0xC3, 0xBC, 0x6E, 0x7A, 0x73, 0x74, 0x72, 0x61, 0xC3, 0x9F, 0x65]

          (0xC3 0xBC being 'ü' and 0xC3 0x9F being 'ß'), is encoded as
          "M%C3%BCnzstra%C3%9Fe".

    Memory friendly, suitable for stack-allocated 'scope' instances.

******************************************************************************/

class EncodeNonUnreserved : PercentEncoder
{
    /**************************************************************************

        Lookup table indexed by character value; an entry is true if and only
        if the character is unreserved, i.e. must not be percent-encoded.

    **************************************************************************/

    static immutable bool[char.max + 1] unreserved = makeUnreservedMap();

    /**************************************************************************

        Builds the content of the unreserved lookup table. Evaluated at
        compile time to initialise the table.

        Returns:
            a table with true at the index of each unreserved character and
            false everywhere else.

    **************************************************************************/

    private static bool[char.max + 1] makeUnreservedMap ( )
    {
        // All entries start out as false.
        bool[char.max + 1] map;

        foreach (c; "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                  ~ "abcdefghijklmnopqrstuvwxyz"
                  ~ "0123456789"
                  ~ "-_.~")
        {
            map[c] = true;
        }

        return map;
    }

    /**************************************************************************

        Constructor

        Params:
            source_in = string to encode, stored as the source of the base
                        class

    **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Decides whether c needs to be percent-encoded.

        Params:
            c = character to look up

        Returns:
            false if c is an unreserved character or true otherwise.

    **************************************************************************/

    protected override bool encode ( char c )
    {
        bool keep_as_is = unreserved[c];

        return !keep_as_is;
    }
}
109
/******************************************************************************

    Encodes all characters except the ASCII graphic ones, that is, encodes
    ASCII whitespace and control characters and all non-ASCII characters.

    The ASCII graphic characters are the printable characters excluding the
    space, i.e. the values from 0x21 ('!') to 0x7E ('~') inclusive.

******************************************************************************/

class EncodeExceptAsciiGraph : PercentEncoder
{
    /**************************************************************************

        Constructor

        Params:
            source_in = source string

    **************************************************************************/

    public this ( cstring source_in = null )
    {
        super(source_in);
    }

    /**************************************************************************

        Tells whether c should be converted.

        Params:
            c = character in question

        Returns:
            true if c should be converted, that is, if it is outside the
            range of ASCII graphic characters, or false otherwise.

    **************************************************************************/

    protected override bool encode ( char c )
    {
        // Test the ASCII range explicitly rather than calling the C isgraph()
        // function: isgraph() honours the current C locale, so with a
        // non-"C" locale it may classify bytes above 0x7F as graphic, which
        // would leave non-ASCII bytes unencoded.
        return c < 0x21 || c > 0x7E;
    }
}
150
/******************************************************************************

    Abstract percent-encoder base class. A subclass decides, per character,
    what gets encoded by implementing encode(); the 'foreach' iteration over
    an instance then produces the encoded output chunk by chunk.

******************************************************************************/

class PercentEncoder
{
    /**************************************************************************

        The string to encode. It may be reassigned at any time except while
        an encoding 'foreach' iteration is in progress.

    **************************************************************************/

    public cstring source;

    /**************************************************************************

        Constructor

        Params:
            source_in = string to encode

    **************************************************************************/

    public this ( cstring source_in = null )
    {
        this.source = source_in;
    }

    /**************************************************************************

        'foreach' iteration over the encoded form of this.source, delivered
        in chunks. A chunk is either a slice of this.source holding a run of
        characters that need no encoding, or a three-character "%XX" escape
        sequence (upper-case hexadecimal digits) for a single encoded
        character. A chunk is never empty.

        The escape sequence chunks refer to a buffer local to this method
        that is overwritten for each encoded character.

    **************************************************************************/

    public int opApply ( scope int delegate ( ref cstring chunk ) dg )
    {
        static immutable hex_digits = "0123456789ABCDEF";

        // Escape sequence buffer; only the two digits change per character.
        char[3] escaped;
        escaped[0] = '%';

        // Index of the first character not yet passed to the delegate.
        size_t pending = 0;

        foreach (pos, ch; this.source)
        {
            if (!this.encode(ch))
            {
                // ch stays as it is and is delivered later as part of a run.
                continue;
            }

            verify(pending <= pos);

            // Deliver the unencoded run in front of ch, if there is one.
            if (pending < pos)
            {
                cstring run = this.source[pending .. pos];

                int stop = dg(run);

                if (stop) return stop;
            }

            escaped[1] = hex_digits[(ch >> 4) & 0xF];
            escaped[2] = hex_digits[ch & 0xF];

            cstring sequence = escaped;

            int stop = dg(sequence);

            if (stop) return stop;

            pending = pos + 1;
        }

        verify(pending <= this.source.length);

        // Deliver the unencoded run at the end of the source, if any.
        if (pending < this.source.length)
        {
            cstring tail = this.source[pending .. $];

            return dg(tail);
        }

        return 0;
    }

    /**************************************************************************

        Tells whether c should be converted. Called by the 'foreach'
        iteration for each character of this.source.

        Params:
            c = character in question

        Returns:
            true if c should be replaced with its "%XX" escape sequence or
            false if it should be output unchanged.

    **************************************************************************/

    protected abstract bool encode ( char c );
}
240
241
// Checks the unreserved character table of EncodeNonUnreserved and the
// sequence of chunks it produces for a sample string.
unittest
{
    // Verifies that each character from first to last, both inclusive, is
    // flagged as unreserved.
    static void checkRange ( char first, char last )
    {
        for (char c = first; c <= last; c++)
        {
            test (EncodeNonUnreserved.unreserved[c],
                  "'" ~ c ~ "' is supposed to be unreserved");
        }
    }

    checkRange('A', 'Z');
    checkRange('a', 'z');
    checkRange('0', '9');

    foreach (c; "-_.~")
    {
        test (EncodeNonUnreserved.unreserved[c],
              "'" ~ c ~ "' is supposed to be unreserved");
    }

    scope encoder = new EncodeNonUnreserved("For example, the octet "
        ~ "corresponding to the tilde (\"~\") character is often encoded as "
        ~ "\"%7E\" by older URI processing implementations; the \"%7E\" can be "
        ~ "replaced by \"~\" without chänging its interpretation.");

    // Expected chunks in order: unencoded runs alternate with one "%XX"
    // escape sequence per encoded byte; the two-byte UTF-8 'ä' yields two
    // escape sequences.
    static immutable istring[] chunks =
    [
        "For", "%20", "example", "%2C", "%20", "the", "%20", "octet", "%20",
        "corresponding","%20", "to", "%20", "the", "%20", "tilde", "%20",
        "%28", "%22", "~", "%22", "%29", "%20", "character", "%20", "is",
        "%20", "often", "%20", "encoded", "%20", "as", "%20", "%22", "%25",
        "7E", "%22", "%20", "by", "%20", "older", "%20", "URI", "%20",
        "processing", "%20", "implementations", "%3B", "%20", "the", "%20",
        "%22", "%25", "7E", "%22", "%20", "can", "%20", "be", "%20",
        "replaced", "%20", "by", "%20", "%22", "~", "%22", "%20", "without",
        "%20", "ch", "%C3", "%A4", "nging", "%20", "its", "%20",
        "interpretation."
    ];

    size_t i = 0;

    foreach (chunk; encoder)
    {
        test (i < chunks.length);
        test (chunks[i++] == chunk);
    }

    // Without this check an encoder that stops early or drops the trailing
    // chunk would go unnoticed.
    test (i == chunks.length,
          "encoder produced fewer chunks than expected");
}