ocean.text.convert.Integer_tango source code

1 /*******************************************************************************
2 
3     A set of functions for converting between string and integer
4     values.
5 
6     Applying the D "import alias" mechanism to this module is highly
7     recommended, in order to limit namespace pollution:
8     ---
9     import Integer = ocean.text.convert.Integer_tango;
10 
11     auto i = Integer.parse ("32767");
12     ---
13 
14     Copyright:
15         Copyright (c) 2004 Kris Bell.
16         Some parts copyright (c) 2009-2016 dunnhumby Germany GmbH.
17         All rights reserved.
18 
19     License:
20         Tango Dual License: 3-Clause BSD License / Academic Free License v3.0.
21         See LICENSE_TANGO.txt for details.
22 
23     Version: Initial release: Nov 2005
24 
25     Authors: Kris
26 
27  *******************************************************************************/
28 
29 module ocean.text.convert.Integer_tango;
30 
31 import ocean.meta.types.Qualifiers;
32 import ocean.core.ExceptionDefinitions;
33 import ocean.core.Verify;
34 import ocean.meta.traits.Basic;
35 
/******************************************************************************

    Parse a 32-bit signed integer from the provided 'digits' string.

    The text is parsed by toLong(), so the same rules apply: the string
    is inspected for a sign and an optional radix prefix. A radix may be
    provided as an argument instead, whereupon it must match the prefix
    (where present). When radix is set to zero, conversion will default
    to decimal.

    Params:
        digits = text to parse
        radix  = expected radix, or zero to take it from the text

    Returns: the parsed value

    Throws: IllegalArgumentException where the input text is not parsable
    in its entirety, or where the parsed value lies outside the range
    int.min .. int.max.

    See_also: the low level functions parse() and convert()

 ******************************************************************************/

int toInt(T) (T[] digits, uint radix=0)
{
    auto x = toLong (digits, radix);

    // check both ends of the range: without the lower bound a value
    // below int.min would be silently truncated by the cast
    if (x > int.max || x < int.min)
        throw new IllegalArgumentException ("Integer.toInt :: integer overflow");
    return cast(int) x;
}
59 
/******************************************************************************

    Parse a 64-bit signed integer from the provided 'digits' string.

    The string is inspected for a sign and an optional radix
    prefix. A radix may be provided as an argument instead,
    whereupon it must match the prefix (where present). When
    radix is set to zero, conversion will default to decimal.

    Params:
        digits = text to parse
        radix  = expected radix, or zero to take it from the text

    Returns: the parsed value. Overflow of the 64-bit range is not
    detected here.

    Throws: IllegalArgumentException where the input text is not parsable
    in its entirety. This includes input holding no digits at all, such
    as an empty string.

    See_also: the low level functions parse() and convert()

 ******************************************************************************/

long toLong(T) (T[] digits, uint radix=0)
{
    uint len;

    auto x = parse (digits, radix, &len);

    // parse() reports zero consumed characters when no digit was found.
    // An empty string would otherwise pass the length comparison and be
    // returned as 0, unlike toUlong() which rejects it.
    if (len == 0 || len < digits.length)
        throw new IllegalArgumentException ("Integer.toLong :: invalid literal");
    return x;
}
85 
/******************************************************************************

    Parse an unsigned integer value from the provided 'digits' string.

    The string is inspected for an optional radix prefix. A
    radix may be provided as an argument instead, whereupon
    it must match the prefix (where present). When radix is
    set to zero, conversion will default to decimal.

    Params:
        digits = text to parse
        radix  = expected radix, or zero to take it from the text

    Returns: the parsed value

    Throws: IllegalArgumentException where the input text is not parsable
    in its entirety, holds no digits, or carries a negative sign.

    See_also: the low level functions parse() and convert()

 ******************************************************************************/

ulong toUlong(T) (T[] digits, uint radix=0)
{
    bool negative = false;

    // skip whitespace, sign and radix prefix; may update radix
    auto skipped = trim (digits, negative, radix);
    if (negative)
        throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

    uint consumed = 0;
    auto result = convert (digits[skipped .. $], radix, &consumed);

    // valid only if at least one digit was read and nothing is left over
    auto complete = consumed != 0 && skipped + consumed >= digits.length;
    if (!complete)
        throw new IllegalArgumentException ("Integer.toUlong :: invalid literal");

    return result;
}
117 
/******************************************************************************

    Convenience wrapper: renders the provided value as char text.

    Params:
        i   = value to render
        fmt = format specification, see format() for details

    Returns: a freshly allocated copy of the formatted text

 ******************************************************************************/

char[] toString (long i, char[] fmt = null)
{
    // scratch space: 64 binary digits plus a two character prefix
    char[66] scratch = void;

    auto text = format (scratch[], i, fmt);

    // the scratch buffer lives on the stack, so hand back a copy
    return text.dup;
}
132 
/******************************************************************************

    Convenience wrapper: renders the provided value as wchar text.

    Params:
        i   = value to render
        fmt = format specification, see format() for details

    Returns: a freshly allocated copy of the formatted text

 ******************************************************************************/

wchar[] toString16 (long i, wchar[] fmt = null)
{
    // scratch space: 64 binary digits plus a two character prefix
    wchar[66] scratch = void;

    auto text = format (scratch[], i, fmt);

    // the scratch buffer lives on the stack, so hand back a copy
    return text.dup;
}
147 
/******************************************************************************

    Convenience wrapper: renders the provided value as dchar text.

    Params:
        i   = value to render
        fmt = format specification, see format() for details

    Returns: a freshly allocated copy of the formatted text

 ******************************************************************************/

dchar[] toString32 (long i, dchar[] fmt = null)
{
    // scratch space: 64 binary digits plus a two character prefix
    dchar[66] scratch = void;

    auto text = format (scratch[], i, fmt);

    // the scratch buffer lives on the stack, so hand back a copy
    return text.dup;
}
162 
/*******************************************************************************

    Formats an integer into the provided buffer according to a textual
    format specification of the form:
    ---
    type width prefix
    ---

    Type is one of [d, g, u, b, x, o] or uppercase equivalent, and
    dictates the conversion radix or other semantics.

    Width is optional and indicates a minimum width for zero-padding,
    while the optional prefix is one of ['#', ' ', '+'] and indicates
    what variety of prefix should be placed in the output. e.g.
    ---
    "d"     => integer
    "u"     => unsigned
    "o"     => octal
    "b"     => binary
    "x"     => hexadecimal
    "X"     => hexadecimal uppercase

    "d+"    => integer prefixed with "+"
    "b#"    => binary prefixed with "0b"
    "x#"    => hexadecimal prefixed with "0x"
    "X#"    => hexadecimal prefixed with "0X"

    "d8"    => decimal padded to 8 places as required
    "b8"    => binary padded to 8 places as required
    "b8#"   => binary padded to 8 places and prefixed with "0b"
    ---

    Note that the specified width is exclusive of the prefix, though
    the width padding will be shrunk as necessary in order to ensure
    a requested prefix can be inserted into the provided output.

    Params:
        dst = output buffer
        i   = value to format; must be of an integer type
        fmt = format specification; an empty one selects "d"

    Returns: the result of formatter() for the decoded specification

*******************************************************************************/

const(T)[] format(T, N) (T[] dst, N i, in T[] fmt = null)
{
    static assert(isIntegerType!(N),
                  "Integer_tango.format only supports integers");

    int  width;
    char type,
         pre;

    // split the specification into its three components ...
    decode (fmt, type, pre, width);

    // ... and let formatter() do the actual conversion
    return formatter (dst, i, type, pre, width);
}
213 
/*******************************************************************************

    Splits a format specification into type, prefix and width.

    The first character is the type. Of the remaining characters, decimal
    digits accumulate into the width and any other character becomes the
    prefix (the last such character wins).

    Params:
        fmt   = format specification; when empty, type is set to 'd'
        type  = receives the type character
        pre   = receives the prefix character, if any
        width = receives the width, zero when none was given

*******************************************************************************/

private void decode(T) (T[] fmt, ref char type, out char pre, out int width)
{
    if (fmt.length is 0)
    {
        type = 'd';
        return;
    }

    type = cast(char) fmt[0];

    foreach (c; fmt[1 .. $])
    {
        if (c >= '0' && c <= '9')
            width = width * 10 + (c - '0');
        else
            pre = cast(char) c;
    }
}
234 
/*******************************************************************************

    One row of the conversion table used by formatter().

    The field order is significant: the table is populated with
    positional initialisers.

*******************************************************************************/

private struct _FormatterInfo(T)
{
    /// conversion radix; negative for the entry used with negative values
    byte radix;

    /// text placed in front of the digits when a prefix is requested
    T[] prefix;

    /// digit characters, indexed by digit value
    T[] numbers;
}
241 
/*******************************************************************************

    Converts an integer to text, writing the digits into the tail of the
    provided buffer.

    Params:
        dst   = output buffer; the text is built from its end backwards
        i_    = value to convert; must be of an integer type
        type  = conversion type: one of d, g, u, b, o, x or the uppercase
                equivalent
        pre   = prefix selector: '#', ' ' or '+'
        width = minimum number of digits (zero padded), or zero for none

    Returns: the populated tail slice of dst. When the type is unknown or
    the buffer cannot hold the result, a message text is returned instead
    of a slice of dst.

*******************************************************************************/

const(T)[] formatter(T, N) (T[] dst, N i_, char type, char pre, int width)
{
    static assert(isIntegerType!(N),
                  "Integer_tango.formatter only supports integers");

    alias _FormatterInfo!(immutable(T)) Info;

    static immutable immutable(T)[] lower = "0123456789abcdef";
    static immutable immutable(T)[] upper = "0123456789ABCDEF";

    // conversion table; the switch below selects an entry by position
    static immutable Info[] formats = [
        { 10, null, lower},
        { -10, "-" , lower},
        { 10, " " , lower},
        { 10, "+" , lower},
        {  2, "0b", lower},
        {  8, "0o", lower},
        { 16, "0x", lower},
        { 16, "0X", upper},
    ];

    // work on a mutable, unqualified copy of the value
    Unqual!(N) i = i_;

    int len = cast(int) dst.length;

    // nothing fits into an empty buffer
    if (len == 0)
        return "{output width too small}";

    ubyte index;

    switch (type)
    {
        case 'd', 'D', 'g', 'G':
            // pick the signed decimal entry matching the sign / prefix
            if (i < 0)
                index = 1;
            else if (pre is ' ')
                index = 2;
            else if (pre is '+')
                index = 3;

            // decimal output always emits the prefix of its table entry
            pre = '#';
            break;

        case 'u', 'U':
            pre = '#';
            break;

        case 'b', 'B':
            index = 4;
            break;

        case 'o', 'O':
            index = 5;
            break;

        case 'x':
            index = 6;
            break;

        case 'X':
            index = 7;
            break;

        default:
            return cast(T[])"{unknown format '"~cast(T)type~"'}";
    }

    auto info = &formats[index];
    auto digits = info.numbers;
    auto radix = info.radix;

    // digits are emitted right to left, starting at the end of dst
    auto p = dst.ptr + len;

    if (index >= 1 && index <= 3)
    {
        // signed decimal: the sign of the radix follows the sign of the
        // value, so the magnitude never has to be negated as a whole
        verify((i >= 0 && radix > 0) || (i < 0 && radix < 0));

        for (;;)
        {
            *--p = digits[abs(i % radix)];
            i /= radix;
            if (i == 0 || --len == 0)
                break;
        }
    }
    else
    {
        // every other conversion treats the bits as an unsigned number
        ulong v = reinterpretInteger!(ulong)(i);

        for (;;)
        {
            *--p = digits[v % radix];
            v /= radix;
            if (v == 0 || --len == 0)
                break;
        }
    }

    immutable(T)[] prefix = null;
    if (pre is '#')
        prefix = info.prefix;

    // not enough room left for the prefix (or the digits ran out of space)
    if (len <= prefix.length)
        return "{output width too small}";

    // len now becomes the offset at which the prefix starts
    len -= prefix.length + 1;

    // prefix number with zeros?
    if (width)
    {
        width = cast(int) (dst.length - width - prefix.length);
        while (len > width && len > 0)
        {
            *--p = '0';
            --len;
        }
    }

    // write optional prefix string ...
    dst [len .. len + prefix.length] = prefix;

    // ... and return the populated slice of the provided output buffer
    return dst [len .. $];
}
362 
/******************************************************************************

    Parse an integer value from the provided 'digits' string.

    The string is inspected for a sign and an optional radix
    prefix. A radix may be provided as an argument instead,
    whereupon it must match the prefix (where present). When
    radix is set to zero, conversion will default to decimal.

    Params:
        digits = text to parse
        radix  = expected radix, or zero to take it from the text
        ate    = when non-null, receives the number of characters used
                 to construct the returned value; zero when no digit
                 was found

    Returns: the parsed value, negated when a '-' sign was present

    Throws: none. The 'ate' param should be checked for valid input.

 ******************************************************************************/

long parse(T) (T[] digits, uint radix=0, uint* ate=null)
{
    bool negative;

    auto skipped = trim (digits, negative, radix);
    auto magnitude = convert (digits[skipped .. $], radix, ate);

    // only count the trimmed characters when digits followed them, so
    // that a lone "-" is not reported as a parsed 0
    if (ate !is null && *ate > 0)
        *ate += skipped;

    if (negative)
        return cast(long) -magnitude;

    return cast(long) magnitude;
}
392 
/******************************************************************************

    Convert the provided 'digits' into an integer value,
    without checking for a sign or radix. The radix defaults
    to decimal (10).

    Conversion stops at the first character that is not an alphanumeric
    digit, or whose digit value is not below the radix. Letters of either
    case stand for the values 10 to 35.

    Params:
        digits = text to convert
        radix  = conversion radix
        ate    = when non-null, receives the number of characters consumed

    Returns: the accumulated value. Overflow is not detected.

    Throws: none. The 'ate' param should be checked for valid input.

 ******************************************************************************/

ulong convert(T) (T[] digits, uint radix=10, uint* ate=null)
{
    ulong total;
    uint  used;

    foreach (c; digits)
    {
        uint digit;

        // map the character onto its digit value
        if (c >= '0' && c <= '9')
            digit = c - '0';
        else if (c >= 'a' && c <= 'z')
            digit = c - 'a' + 10;
        else if (c >= 'A' && c <= 'Z')
            digit = c - 'A' + 10;
        else
            break;

        // a digit outside the radix ends the number
        if (digit >= radix)
            break;

        total = total * radix + digit;
        ++used;
    }

    if (ate !is null)
        *ate = used;

    return total;
}
438 
/******************************************************************************

    Strip leading whitespace, extract an optional +/- sign,
    and an optional radix prefix. If the radix value matches
    an optional prefix, or the radix is zero, the prefix will
    be consumed and assigned. Where the radix is non zero and
    does not match an explicit prefix, the latter will remain
    unconsumed. Otherwise, radix will default to 10.

    Params:
        digits = text to inspect
        sign   = set to true for a '-' and to false for a '+'; left
                 untouched when no sign character is present
        radix  = expected radix on entry (zero for "any"); updated as
                 described above. Left untouched when digits is empty.

    Returns: the number of characters consumed.

 ******************************************************************************/

uint trim(T) (T[] digits, ref bool sign, ref uint radix)
{
    Unqual!(T) c;
    auto       p = digits.ptr;
    auto       len = digits.length;

    if (len)
    {
        // strip off whitespace and sign characters. Each character is
        // fetched only after 'len' confirms it lies inside the input, so
        // text consisting solely of whitespace / signs is never read past
        // its end.
        for (; len; ++p, --len)
        {
            c = *p;

            if (c is ' ' || c is '\t')
                continue;

            if (c is '-')
                sign = true;
            else
                if (c is '+')
                    sign = false;
                else
                    break;
        }

        // strip off a radix specifier also?
        auto r = radix;
        if (c is '0' && len > 1)
        {
            switch (*++p)
            {
                case 'x':
                case 'X':
                    ++p;
                    r = 16;
                    break;

                case 'b':
                case 'B':
                    ++p;
                    r = 2;
                    break;

                case 'o':
                case 'O':
                    ++p;
                    r = 8;
                    break;

                default:
                    // not a prefix: step back onto the leading '0'
                    --p;
                    break;
            }
        }

        // default the radix to 10
        if (r is 0)
            radix = 10;
        else
        {
            // explicit radix must match (optional) prefix
            if (radix != r)
            {
                if (radix)
                    // mismatch: hand the two prefix characters back
                    p -= 2;
                else
                    radix = r;
            }
        }
    }

    // return number of characters eaten
    auto charcount = (p - digits.ptr);
    assert(charcount >= 0);
    return cast(uint) charcount;
}
524 
/******************************************************************************

    Quick & dirty text-to-unsigned int converter. Use only when you
    know what the content is, or use parse() or convert() instead.

    Only the characters '0' to '9' are treated as digits, whatever the
    radix; conversion stops at the first other character.

    Params:
        s     = text to convert
        radix = multiplier applied per digit position

    Returns: the parsed uint

 ******************************************************************************/

uint atoi(T) (T[] s, int radix = 10)
{
    uint total;

    for (size_t idx = 0; idx < s.length; ++idx)
    {
        auto ch = s[idx];

        // stop at the first non-digit
        if (ch < '0' || ch > '9')
            break;

        total = total * radix + (ch - '0');
    }

    return total;
}
545 
546 
/******************************************************************************

    Quick & dirty unsigned to text converter, where the provided output
    must be large enough to house the result (10 digits in the largest
    case for radix 10, 32 digits for radix 2). For mainstream use,
    consider utilizing format() instead.

    Digit values of 10 and above are written as lowercase letters, so
    any radix from 2 to 36 yields readable output.

    Params:
        output = buffer to fill; the text is written into its tail
        value  = value to convert
        radix  = conversion radix, at least 2

    Returns: a populated slice of the provided output

 ******************************************************************************/

T[] itoa(T) (T[] output, uint value, int radix = 10)
{
    // a radix below 2 would never reduce the value to zero
    assert(radix >= 2, "Integer.itoa :: radix must be at least 2");

    T* p = output.ptr + output.length;

    do {
        // catch an undersized buffer before writing in front of it
        assert(p > output.ptr, "Integer.itoa :: output buffer too small");

        auto digit = value % radix;

        // '0'..'9' for the first ten digit values, 'a'.. for the rest
        *--p = cast(T)(digit < 10 ? digit + '0' : digit - 10 + 'a');
    } while (value /= radix);

    return output[cast(size_t) (p-output.ptr) .. $];
}
566 
/******************************************************************************

    Consume a number from the input without converting it. Argument
    'fp' enables floating-point consumption. Supports hex input for
    numbers which are prefixed appropriately.

    Params:
        src = text to inspect
        fp  = when true, a fraction and an exponent are consumed as well

    Returns: the leading slice of src covering everything skipped by
    trim() plus the number which follows, or null where nothing is left
    after trimming.

    Since version 0.99.9

 ******************************************************************************/

T[] consume(T) (T[] src, bool fp=false)
{
    bool sign;
    uint radix;

    static bool isDecimal (dchar c)
    {
        return c >= '0' && c <= '9';
    }

    static bool isHexLetter (dchar c)
    {
        return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    // remove leading space, sign and radix prefix
    size_t pos = trim (src, sign, radix);
    auto   start = pos;
    auto   end = src.length;

    // bail out if nothing remains after trimming
    if (pos >= end)
        return null;

    // Every access below is guarded by 'pos < end' before the character
    // is read, so the scan never looks beyond the input.

    // read leading digits
    while (pos < end && (isDecimal (src[pos]) ||
                (radix is 16 && isHexLetter (src[pos]))))
        ++pos;

    if (fp)
    {
        // gobble up a point
        if (pos < end && src[pos] is '.')
            ++pos;

        // read fractional digits
        while (pos < end && isDecimal (src[pos]))
            ++pos;

        // did we consume anything?
        if (pos > start)
        {
            // consume exponent?
            if (pos < end && (src[pos] is 'e' || src[pos] is 'E'))
            {
                ++pos;

                // optional exponent sign
                if (pos < end && (src[pos] is '+' || src[pos] is '-'))
                    ++pos;

                while (pos < end && isDecimal (src[pos]))
                    ++pos;
            }
        }
    }
    return src [0 .. pos];
}
623 
/*******************************************************************************

    Get the absolute value of a number

    For a signed `T` the argument must differ from `T.min`: in two's
    complement `-T.min` would be `T.max + 1`, which cannot be represented,
    and computing it overflows back to `T.min`. This precondition is
    enforced with `verify`.

    Params:
        x = A value between `T.min` (exclusive for signed number) and `T.max`

    Returns:
        The absolute value of `x` (`|x|`)

*******************************************************************************/

private T abs (T) (T x)
{
    // the precondition only exists for types which can be negative
    static if (T.min < 0)
    {
        verify(x != T.min,
            "abs cannot be called with x == " ~ T.stringof ~ ".min");
    }

    if (x < 0)
        return -x;

    return x;
}
651 
652 
/*******************************************************************************

    Truncates or zero-extend a value of type `From` to fit into `To`.

    Getting the same binary representation of a number in a larger type can be
    quite tedious, especially when it comes to negative numbers.
    For example, turning `byte(-1)` into `long` or `ulong` gives different
    result.
    This function allows to get the same exact binary representation of an
    integral type into another. If the representation is truncating, it is
    just a cast. If it is widening, it zero extends `val`.

    The widening is done by casting through the unsigned type of the same
    size as `From`, so it does not depend on the byte order of the target.

    Params:
        To      = Type to convert to
        From    = Type to convert from. If not specified, it is inferred from
                  val, so it will be an `int` when passing a literal.
        val     = Value to reinterpret

    Returns:
        Binary representation of `val` typed as `To`

*******************************************************************************/

private To reinterpretInteger (To, From) (From val)
{
    static if (From.sizeof >= To.sizeof)
        // same size or narrowing: a plain cast keeps the low bits
        return cast(To) val;
    else static if (From.sizeof == 1)
        return cast(To) cast(ubyte) val;
    else static if (From.sizeof == 2)
        return cast(To) cast(ushort) val;
    else static if (From.sizeof == 4)
        return cast(To) cast(uint) val;
    else static if (From.sizeof == 8)
        return cast(To) cast(ulong) val;
    else
        static assert(false,
            "reinterpretInteger: unsupported source type " ~ From.stringof);
}
694 
695 
/******************************************************************************

    Manual smoke test, compiled only with the 'Integer' debug identifier.
    Prints the result of format() for a range of format specifications
    and of consume() for a few sample inputs.

 ******************************************************************************/

debug (Integer)
{
    import ocean.io.Stdout;

    void main()
    {
        char[8] tmp;

        // default conversion of int arguments
        foreach (value; [10, -10])
            Stdout.formatln ("d '{}'", format(tmp, value));

        // [label, format specification] pairs, each applied to 10L
        string[2][] specs = [
            ["u", "u"], ["U", "U"], ["g", "g"], ["G", "G"],
            ["o", "o"], ["O", "O"], ["b", "b"], ["B", "B"],
            ["x", "x"], ["X", "X"],

            ["d+", "d+"], ["ds", "d "], ["d#", "d#"], ["x#", "x#"],
            ["X#", "X#"], ["b#", "b#"], ["o#", "o#"],

            ["d1", "d1"], ["d8", "d8"], ["x8", "x8"], ["X8", "X8"],
            ["b8", "b8"], ["o8", "o8"],

            ["d1#", "d1#"], ["d6#", "d6#"], ["x6#", "x6#"], ["X6#", "X6#"],

            ["b12#", "b12#"], ["o12#", "o12#"],
        ];

        foreach (entry; specs)
            Stdout.formatln (entry[0] ~ " '{}'", format(tmp, 10L, entry[1]));
        Stdout.newline();

        // consume() samples: input text and whether floating-point
        // consumption is enabled
        struct Sample
        {
            string text;
            bool   fp;
        }

        Sample[][] groups = [
            [
                Sample("10", false),
                Sample("0x1f", false),
                Sample("0.123", false),
                Sample("0.123", true),
                Sample("0.123e-10", true),
            ],
            [
                Sample("10  s", false),
                Sample("0x1f   s", false),
                Sample("0.123  s", false),
                Sample("0.123  s", true),
                Sample("0.123e-10  s", true),
            ],
        ];

        foreach (group; groups)
        {
            foreach (sample; group)
                Stdout.formatln (consume(sample.text, sample.fp));
            Stdout.newline();
        }
    }
}