1 /*******************************************************************************
2 
3         Copyright:
4             Copyright (C) 2008 Kris Bell.
5             Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
6             All rights reserved.
7 
8         License:
9             Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
10             See LICENSE_TANGO.txt for details.
11 
12         Version: Aug 2008: Initial release
13 
14         Authors: Kris
15 
16 *******************************************************************************/
17 
18 module ocean.text.xml.DocEntity;
19 
20 import Util = ocean.text.Util;
21 
/******************************************************************************

        Convert XML entity patterns to normal characters

        <pre>
        &amp; => &
        &quot; => "
        etc.
        </pre>

        The recognised patterns are &amp; &apos; &gt; &lt; and &quot;. An '&'
        which does not introduce one of these (including an '&' at the very
        end of the input) is copied through unchanged.

        Params:
            src = the text to decode
            dst = optional workspace for the decoded text. It is resized to
                  src.length when it is shorter than src

        Returns:
            src itself when it contains no '&' at all, otherwise a slice of
            dst holding the decoded text

******************************************************************************/

T[] fromEntity (T) (T[] src, T[] dst = null)
{
        // offsets are compared against, and added to, size_t quantities
        size_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything. This relies on
        // Util.indexOf yielding a value >= len when no '&' is present,
        // which is what the '< len' test implies
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // make some room if not enough provided. The decoded text is
           // never longer than the source, so src.length always suffices
           if (dst.length < src.length)
               dst.length = src.length;
           auto d = dst.ptr;

           // copy segments over, a chunk at a time
           do {
              // literal run preceding the '&'
              d [0 .. delta] = s [0 .. delta];
              len -= delta;
              s += delta;
              d += delta;

              // translate entity: token is the number of source chars
              // consumed, or 0 while nothing has been recognised
              auto token = 0;

              // s[0] is the '&' and len >= 1 here. Only inspect s[1] when
              // it exists, otherwise an '&' at the very end of src would
              // be followed by a read beyond the array
              if (len > 1)
                 {
                 switch (s[1])
                        {
                        case 'a':
                             if (len > 4 && s[1..5] == "amp;")
                                {
                                *d++ = '&';
                                token = 5;
                                }
                             else
                                if (len > 5 && s[1..6] == "apos;")
                                   {
                                   *d++ = '\'';
                                   token = 6;
                                   }
                             break;

                        case 'g':
                             if (len > 3 && s[1..4] == "gt;")
                                {
                                *d++ = '>';
                                token = 4;
                                }
                             break;

                        case 'l':
                             if (len > 3 && s[1..4] == "lt;")
                                {
                                *d++ = '<';
                                token = 4;
                                }
                             break;

                        case 'q':
                             if (len > 5 && s[1..6] == "quot;")
                                {
                                *d++ = '"';
                                token = 6;
                                }
                             break;

                        default:
                             break;
                        }
                 }

              // not a known entity: keep the '&' itself and move past it
              if (token == 0)
                 {
                 *d++ = '&';
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           d [0 .. len] = s [0 .. len];
           return dst [0 .. (d + len) - dst.ptr];
           }
        return src;
}
99 
100 
/******************************************************************************

        Convert XML entity patterns to normal characters
        ---
        &amp; => &
        &quot; => "
        etc
        ---

        This variant does not require an interim workspace, and instead
        emits directly via the provided delegate

        The recognised patterns are &amp; &apos; &gt; &lt; and &quot;. An '&'
        which does not introduce one of these (including an '&' at the very
        end of the input) is emitted unchanged.

        Params:
            src  = the text to decode
            emit = invoked, in order, with each piece of the decoded text:
                   slices of src for the literal runs and a one-character
                   string for each '&' or decoded entity. A piece may be
                   empty. When src contains no '&' at all, emit is called
                   exactly once with src

******************************************************************************/

void fromEntity (T) (T[] src, scope void delegate(T[]) emit)
{
        // offsets are compared against, and added to, size_t quantities
        size_t delta;
        auto s = src.ptr;
        auto len = src.length;

        // take a peek first to see if there's anything. This relies on
        // Util.indexOf yielding a value >= len when no '&' is present,
        // which is what the '< len' test implies
        if ((delta = Util.indexOf (s, '&', len)) < len)
           {
           // copy segments over, a chunk at a time
           do {
              // literal run preceding the '&'
              emit (s [0 .. delta]);
              len -= delta;
              s += delta;

              // translate entity: token is the number of source chars
              // consumed, or 0 while nothing has been recognised
              auto token = 0;

              // s[0] is the '&' and len >= 1 here. Only inspect s[1] when
              // it exists, otherwise an '&' at the very end of src would
              // be followed by a read beyond the array
              if (len > 1)
                 {
                 switch (s[1])
                        {
                        case 'a':
                             if (len > 4 && s[1..5] == "amp;")
                                {
                                emit ("&");
                                token = 5;
                                }
                             else
                                if (len > 5 && s[1..6] == "apos;")
                                   {
                                   emit ("'");
                                   token = 6;
                                   }
                             break;

                        case 'g':
                             if (len > 3 && s[1..4] == "gt;")
                                {
                                emit (">");
                                token = 4;
                                }
                             break;

                        case 'l':
                             if (len > 3 && s[1..4] == "lt;")
                                {
                                emit ("<");
                                token = 4;
                                }
                             break;

                        case 'q':
                             if (len > 5 && s[1..6] == "quot;")
                                {
                                emit ("\"");
                                token = 6;
                                }
                             break;

                        default:
                             break;
                        }
                 }

              // not a known entity: keep the '&' itself and move past it
              if (token == 0)
                 {
                 emit ("&");
                 token = 1;
                 }

              s += token;
              len -= token;
              } while ((delta = Util.indexOf (s, '&', len)) < len);

           // copy tail too
           emit (s [0 .. len]);
           }
        else
           emit (src);
}
174 
175 
/******************************************************************************

        Convert reserved chars to entities. For example: " => &quot;

        The reserved chars are " > < & and ' which become &quot; &gt; &lt;
        &amp; and &apos; respectively.

        Params:
            src = the text to encode
            dst = optional output buffer. It is grown (possibly moving it
                  to the heap) whenever it is too small for the converted
                  output

        Returns:
            the original src when it holds no reserved chars, otherwise a
            slice of dst containing the converted text

******************************************************************************/

T[] toEntity(T) (T[] src, T[] dst = null)
{
        auto cursor  = src.ptr;                 // current scan position
        auto pending = cursor;                  // start of the uncopied run
        auto limit   = cursor + src.length;     // one past the last char
        auto used    = 0;                       // chars written to dst

        for (; cursor < limit; ++cursor)
            {
            T[] entity;

            switch (*cursor)
                   {
                   case '"':
                        entity = "&quot;";
                        break;

                   case '>':
                        entity = "&gt;";
                        break;

                   case '<':
                        entity = "&lt;";
                        break;

                   case '&':
                        entity = "&amp;";
                        break;

                   case '\'':
                        entity = "&apos;";
                        break;

                   default:
                        // ordinary char: stays part of the pending run
                        continue;
                   }

            // a reserved char was found: flush the literal run in front
            // of it, followed by the replacement entity
            auto run = cursor - pending;
            if (dst.length <= used + run + entity.length)
                dst.length = (dst.length + run + entity.length) + dst.length / 2;

            dst [used .. used + run] = pending [0 .. run];
            used += run;

            dst [used .. used + entity.length] = entity;
            used += entity.length;

            // the next run begins right after the reserved char
            pending = cursor + 1;
            }

        // nothing was replaced, so hand back the input untouched
        if (used == 0)
            return src;

        // append whatever follows the last reserved char
        auto tail = limit - pending;
        if (dst.length <= used + tail)
            dst.length = used + tail;

        dst [used .. used + tail] = pending [0 .. tail];
        return dst [0 .. used + tail];
}
252 
253 
/******************************************************************************

        Convert reserved chars to entities. For example: " => &quot;

        This variant does not require an interim workspace, and instead
        emits directly via the provided delegate

        The reserved chars are " > < & and ' which become &quot; &gt; &lt;
        &amp; and &apos; respectively.

        Params:
            src  = the text to encode
            emit = invoked, in order, with each piece of the converted
                   text: non-empty literal runs as slices of src, each
                   replacement entity, and finally the (possibly empty)
                   tail. When src holds no reserved chars, emit is called
                   exactly once with src

******************************************************************************/

void toEntity(T) (T[] src, scope void delegate(T[]) emit)
{
        T[]  replacement;                       // last entity emitted, if any
        auto cursor  = src.ptr;                 // current scan position
        auto pending = cursor;                  // start of the unemitted run
        auto limit   = cursor + src.length;     // one past the last char

        for (; cursor < limit; ++cursor)
            {
            switch (*cursor)
                   {
                   case '"':
                        replacement = "&quot;";
                        break;

                   case '>':
                        replacement = "&gt;";
                        break;

                   case '<':
                        replacement = "&lt;";
                        break;

                   case '&':
                        replacement = "&amp;";
                        break;

                   case '\'':
                        replacement = "&apos;";
                        break;

                   default:
                        // ordinary char: stays part of the pending run
                        continue;
                   }

            // a reserved char was found: emit the literal run in front
            // of it (when there is one), then the replacement entity
            if (cursor > pending)
                emit (pending [0 .. cursor - pending]);
            emit (replacement);

            // the next run begins right after the reserved char
            pending = cursor + 1;
            }

        // replacement is only ever set when a reserved char was seen
        if (replacement.length == 0)
            emit (src);
        else
            // emit whatever follows the last reserved char
            emit (pending [0 .. limit - pending]);
}