tinyendian/source/tinyendian.d

//          Copyright Ferdinand Majerech 2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

/// A minimal library providing functionality for changing the endianness of data.
module tinyendian;

import std.system;
import std.utf;

static if(__VERSION__ < 2066)
    private enum nogc;

/// The Unicode transformation formats distinguished by this module.
enum UTFEncoding : ubyte
{
    /// UTF-8 (1-byte code units).
    UTF_8,
    /// UTF-16 (2-byte code units).
    UTF_16,
    /// UTF-32 (4-byte code units).
    UTF_32,
}
///
@safe unittest
{
    // Reversing the byte order twice must reproduce the original values exactly,
    // for integers ...
    const originalInts = [314, -101];
    int[2] intScratch = originalInts;
    foreach (pass; 0 .. 2)
        swapByteOrder(intScratch[]);
    assert(originalInts == intScratch, "Lost information when swapping byte order");

    // ... and for floating point values, whose bits are swapped without any
    // numeric conversion.
    const originalFloats = [3.14f, 10.1f];
    float[2] floatScratch = originalFloats;
    foreach (pass; 0 .. 2)
        swapByteOrder(floatScratch[]);
    assert(originalFloats == floatScratch, "Lost information when swapping byte order");
}

/** Reverse the byte order of every item of an array, in place.
 *
 * Items are treated as raw 2-byte or 4-byte blobs: the bytes of each item are
 * reversed without interpreting the value, so this works for integers, characters
 * and floating point types alike.
 *
 * Params:
 *
 * T     = Item type. Must be exactly 2 or 4 bytes in size.
 * array = Buffer whose items will have their bytes reversed.
 */
void swapByteOrder(T)(T[] array) @trusted @nogc pure nothrow
if (T.sizeof == 2 || T.sizeof == 4)
{
    static if (T.sizeof == 2)
    {
        import std.algorithm.mutation : swap;

        // A 2-byte item is reversed by exchanging its two bytes.
        foreach (ref element; array)
        {
            auto bytes = cast(ubyte*)&element;
            swap(bytes[0], bytes[1]);
        }
    }
    else static if (T.sizeof == 4)
    {
        import core.bitop : bswap;

        // A 4-byte item is reinterpreted as uint, reversed with bswap and then
        // written back through a pointer cast, so that no value conversion (e.g.
        // int <-> float) takes place.
        foreach (ref element; array)
        {
            const uint reversed = bswap(*cast(uint*)&element);
            element = *cast(const(T)*)&reversed;
        }
    }
    else static assert(false, "Unsupported T: " ~ T.stringof);
}

/// Value returned by fixUTFByteOrder; see that function for details.
struct FixUTFByteOrderResult
{
    /// Slice of the input buffer holding the UTF data, without the BOM and without
    /// any stripped trailing bytes.
    ubyte[] array;
    /// Encoding determined from the BOM (UTF-8 when no BOM was found).
    UTFEncoding encoding;
    /// Byte order reported for the input data.
    Endian endian;
    /// Number of trailing bytes dropped because they did not form a whole
    /// UTF-16/UTF-32 code unit. Zero unless such bytes were dropped.
    uint bytesStripped;
}

/** Detect the UTF encoding of a buffer from its byte-order-mark (BOM) and convert the
 * data to the byte order of the host system, in place.
 *
 * The encoding is determined solely from the BOM at the start of array. If no BOM is
 * present, UTF-8 is assumed (which is compatible with ASCII). The BOM, if any, is
 * excluded from the returned slice.
 *
 * For UTF-16 and UTF-32, trailing bytes that do not form a complete code unit (1 byte
 * for UTF-16, 1-3 bytes for UTF-32) are excluded from the returned slice as well.
 *
 * Note that this function does $(B not) validate the UTF data. It only looks at the
 * BOM and swaps the bytes of 2-byte or 4-byte items when needed.
 *
 * Params:
 *
 * array = The array with UTF-data. Its contents are modified if the byte order needs
 *         to be changed.
 *
 * Returns:
 *
 * A FixUTFByteOrderResult with the following members:
 *
 * $(D ubyte[] array)            A slice of the input array containing data in host
 *                               byte order, without the BOM and, for UTF-16/UTF-32,
 *                               without stripped bytes, if any.
 * $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32).
 * $(D std.system.Endian endian) Byte order of the original data according to its BOM.
 *                               A UTF-8 BOM reports the host byte order; with no BOM
 *                               at all this is $(D Endian.init).
 * $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array, if
 *                               any. This is non-zero only if array.length was not
 *                               divisible by 2 or 4 for UTF-16 and UTF-32, respectively.
 *
 * Complexity: O(array.length)
 */
auto fixUTFByteOrder(ubyte[] array) @safe @nogc pure nothrow
{
    import std.algorithm.searching : startsWith;

    // Identifies the detected BOM. The value of every member except None is an index
    // into the bomBytes and bomByteOrder tables below.
    enum BOM: ubyte
    {
        UTF_8     = 0,
        UTF_16_LE = 1,
        UTF_16_BE = 2,
        UTF_32_LE = 3,
        UTF_32_BE = 4,
        None      = ubyte.max
    }

    // Byte sequence of each BOM and the byte order it denotes, indexed by BOM value
    // (according to the original author, these tables come from std.stream).
    static immutable ubyte[][5] bomBytes = [ [0xEF, 0xBB, 0xBF],
                                             [0xFF, 0xFE],
                                             [0xFE, 0xFF],
                                             [0xFF, 0xFE, 0x00, 0x00],
                                             [0x00, 0x00, 0xFE, 0xFF] ];
    // UTF-8 has no byte order of its own; its BOM is mapped to the host byte order.
    static immutable Endian[5] bomByteOrder = [ std.system.endian,
                                                Endian.littleEndian,
                                                Endian.bigEndian,
                                                Endian.littleEndian,
                                                Endian.bigEndian ];

    // Find the BOM at the start of the buffer, if any. The table is searched from the
    // end so that the highest-indexed match wins: the UTF-32LE BOM begins with the
    // UTF-16LE BOM, and must take precedence over it.
    BOM detected = BOM.None;
    foreach_reverse (index, marker; bomBytes)
    {
        if (array.startsWith(marker))
        {
            detected = cast(BOM)index;
            break;
        }
    }

    FixUTFByteOrderResult result;

    if (detected == BOM.None)
        result.endian = Endian.init;
    else
        result.endian = bomByteOrder[detected];

    // Offset of the first byte after the BOM (0 when there is no BOM).
    size_t dataStart = 0;
    final switch (detected)
    {
        case BOM.None:
            result.encoding = UTFEncoding.UTF_8;
            break;
        case BOM.UTF_8:
            result.encoding = UTFEncoding.UTF_8;
            dataStart = 3;
            break;
        case BOM.UTF_16_LE, BOM.UTF_16_BE:
            result.encoding = UTFEncoding.UTF_16;
            dataStart = 2;
            // The BOM is a whole code unit, so the remainder of the full length is
            // the size of the incomplete trailing code unit.
            result.bytesStripped = array.length % 2;
            break;
        case BOM.UTF_32_LE, BOM.UTF_32_BE:
            result.encoding = UTFEncoding.UTF_32;
            dataStart = 4;
            result.bytesStripped = array.length % 4;
            break;
    }

    // Drop the BOM and any incomplete trailing code unit by narrowing the slice; no
    // data is moved. Without a BOM both adjustments are zero and this is a no-op.
    array = array[dataStart .. $ - result.bytesStripped];

    // The slice length is now a multiple of the code unit size, so it can be
    // reinterpreted as an array of code units for swapping.
    if (result.endian != std.system.endian)
    {
        final switch (result.encoding)
        {
            case UTFEncoding.UTF_8:
                // Single-byte code units have no byte order to fix.
                break;
            case UTFEncoding.UTF_16:
                swapByteOrder(cast(wchar[])array);
                break;
            case UTFEncoding.UTF_32:
                swapByteOrder(cast(dchar[])array);
                break;
        }
    }

    result.array = array;
    return result;
}
///
@safe unittest
{
    {
        // UTF-8 BOM: the BOM is removed and the host byte order is reported, so the
        // expectation must not hard-code little endian.
        ubyte[] s = [0xEF, 0xBB, 0xBF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        assert(r.endian == std.system.endian);
        assert(r.bytesStripped == 0);
    }

    {
        // No BOM: UTF-8 is assumed and the data is returned untouched.
        ubyte[] s = ['a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_8);
        assert(r.array.length == 1);
        assert(r.array == ['a']);
        assert(r.endian == Endian.bigEndian);
        assert(r.bytesStripped == 0);
    }

    {
        // strip 'a' b/c not complete unit
        ubyte[] s = [0xFE, 0xFF, 'a'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_16);
        assert(r.array.length == 0);
        assert(r.endian == Endian.bigEndian);
        assert(r.bytesStripped == 1);
    }

    {
        // UTF-32 big endian: one complete code unit followed by a stray byte. The
        // code unit must read as 'a' in host byte order whatever the host is.
        ubyte[] s = [0x00, 0x00, 0xFE, 0xFF, 0x00, 0x00, 0x00, 'a', 'x'];
        FixUTFByteOrderResult r = fixUTFByteOrder(s);
        assert(r.encoding == UTFEncoding.UTF_32);
        assert(r.endian == Endian.bigEndian);
        assert(r.bytesStripped == 1);
        assert(r.array.length == 4);
        assert((cast(dchar[])r.array)[0] == 'a');
    }

}
Initial commit. 2014-08-06 13:35:59 +00:00			`// Copyright Ferdinand Majerech 2014.`
			`// Distributed under the Boost Software License, Version 1.0.`
			`// (See accompanying file LICENSE_1_0.txt or copy at`
			`// http://www.boost.org/LICENSE_1_0.txt)`

			`/// A minimal library providing functionality for changing the endianness of data.`
			`module tinyendian;`

			`import std.system;`
			`import std.utf;`

try to get it compiling on pre 2066 2015-03-28 10:06:47 +00:00			`static if(__VERSION__ < 2066)`
			`private enum nogc;`
Initial commit. 2014-08-06 13:35:59 +00:00
			`/// Unicode UTF encodings.`
			`enum UTFEncoding : ubyte`
			`{`
			`UTF_8,`
			`UTF_16,`
			`UTF_32`
			`}`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`///`
			`@safe unittest`
Fixed floating point byte swapping. 2014-11-21 22:09:45 +00:00			`{`
			`const ints = [314, -101];`
			`int[2] intsSwapBuffer = ints;`
			`swapByteOrder(intsSwapBuffer[]);`
			`swapByteOrder(intsSwapBuffer[]);`
			`assert(ints == intsSwapBuffer, "Lost information when swapping byte order");`

			`const floats = [3.14f, 10.1f];`
			`float[2] floatsSwapBuffer = floats;`
			`swapByteOrder(floatsSwapBuffer[]);`
			`swapByteOrder(floatsSwapBuffer[]);`
			`assert(floats == floatsSwapBuffer, "Lost information when swapping byte order");`
			`}`

Doc comment format change. 2015-02-16 12:52:28 +00:00			`/** Swap byte order of items in an array in place.`
			`*`
			`* Params:`
			`*`
			`* T = Item type. Must be either 2 or 4 bytes long.`
			`* array = Buffer with values to fix byte order of.`
			`*/`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`void swapByteOrder(T)(T[] array) @trusted @nogc pure nothrow`
			`if (T.sizeof == 2 \|\| T.sizeof == 4)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
			`// Swap the byte order of all read characters.`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`foreach (ref item; array)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`static if (T.sizeof == 2)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`import std.algorithm.mutation : swap;`
Initial commit. 2014-08-06 13:35:59 +00:00			`swap(cast(ubyte)&item, (cast(ubyte)&item + 1));`
			`}`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`else static if (T.sizeof == 4)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`import core.bitop : bswap;`
Fixed floating point byte swapping. 2014-11-21 22:09:45 +00:00			`const swapped = bswap(cast(uint)&item);`
			`item = cast(const(T))&swapped;`
Initial commit. 2014-08-06 13:35:59 +00:00			`}`
			`else static assert(false, "Unsupported T: " ~ T.stringof);`
			`}`
			`}`

make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`/// See fixUTFByteOrder.`
			`struct FixUTFByteOrderResult`
			`{`
			`ubyte[] array;`
			`UTFEncoding encoding;`
			`Endian endian;`
			`uint bytesStripped = 0;`
			`}`

Doc comment format change. 2015-02-16 12:52:28 +00:00			`/** Convert byte order of an array encoded in UTF(8/16/32) to system endianness in place.`
			`*`
			`* Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM`
			`* at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The`
			`* BOM, if any, will be removed from the buffer.`
			`*`
			`* If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes`
			`* for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by`
			`* 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.`
			`*`
			`* Note that this function does $(B not) check if the array is a valid UTF string. It`
			`* only works with the BOM and 1,2 or 4-byte items.`
			`*`
			`* Params:`
			`*`
			`* array = The array with UTF-data.`
			`*`
			`* Returns:`
			`*`
			`* A struct with the following members:`
			`*`
			`* $(D ubyte[] array) A slice of the input array containing data in correct`
			`* byte order, without BOM and in case of UTF-16/UTF-32,`
			`* without stripped bytes, if any.`
			`* $(D UTFEncoding encoding) Encoding of the result (UTF-8, UTF-16 or UTF-32)`
			`* $(D std.system.Endian endian) Endianness of the original array.`
			`* $(D uint bytesStripped) Number of bytes stripped from a UTF-16/UTF-32 array, if`
			`* any. This is non-zero only if array.length was not`
			`* divisible by 2 or 4 for UTF-16 and UTF-32, respectively.`
			`*`
			`* Complexity: (BIGOH array.length)`
			`*/`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`auto fixUTFByteOrder(ubyte[] array) @safe @nogc pure nothrow`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
			`// Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.`
			`enum BOM: ubyte`
			`{`
			`UTF_8 = 0,`
			`UTF_16_LE = 1,`
			`UTF_16_BE = 2,`
			`UTF_32_LE = 3,`
			`UTF_32_BE = 4,`
			`None = ubyte.max`
			`}`

			`// These 2 are from std.stream`
			`static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],`
			`[0xFF, 0xFE],`
			`[0xFE, 0xFF],`
			`[0xFF, 0xFE, 0x00, 0x00],`
			`[0x00, 0x00, 0xFE, 0xFF] ];`
			`static immutable Endian[5] bomEndian = [ std.system.endian,`
			`Endian.littleEndian,`
			`Endian.bigEndian,`
			`Endian.littleEndian,`
			`Endian.bigEndian ];`

			`// Documented in function ddoc.`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00
			`FixUTFByteOrderResult result;`
Initial commit. 2014-08-06 13:35:59 +00:00
			`// Detect BOM, if any, in the bytes we've read. -1 means no BOM.`
			`// Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we`
			`// used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`import std.algorithm.searching : startsWith;`
Initial commit. 2014-08-06 13:35:59 +00:00			`BOM bomId = BOM.None;`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`foreach (i, bom; byteOrderMarks)`
			`if (array.startsWith(bom))`
			`bomId = cast(BOM)i;`
Initial commit. 2014-08-06 13:35:59 +00:00
			`result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;`

			`// Start of UTF data (after BOM, if any)`
			`size_t start = 0;`
			`// If we've read more than just the BOM, put the rest into the array.`
			`with(BOM) final switch(bomId)`
			`{`
			`case None: result.encoding = UTFEncoding.UTF_8; break;`
			`case UTF_8:`
			`start = 3;`
			`result.encoding = UTFEncoding.UTF_8;`
			`break;`
			`case UTF_16_LE, UTF_16_BE:`
			`result.bytesStripped = array.length % 2;`
			`start = 2;`
			`result.encoding = UTFEncoding.UTF_16;`
			`break;`
			`case UTF_32_LE, UTF_32_BE:`
			`result.bytesStripped = array.length % 4;`
			`start = 4;`
			`result.encoding = UTFEncoding.UTF_32;`
			`break;`
			`}`

			`// If there's a BOM, we need to move data back to ensure it starts at array[0]`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`if (start != 0)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`array = array[start .. $ - result.bytesStripped];`
Initial commit. 2014-08-06 13:35:59 +00:00			`}`

			`// We enforce above that array.length is divisible by 2/4 for UTF-16/32`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`if (std.system.endian != result.endian)`
Initial commit. 2014-08-06 13:35:59 +00:00			`{`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`if (result.encoding == UTFEncoding.UTF_16)`
			`swapByteOrder(cast(wchar[])array);`
			`else if (result.encoding == UTFEncoding.UTF_32)`
			`swapByteOrder(cast(dchar[])array);`
Initial commit. 2014-08-06 13:35:59 +00:00			`}`

			`result.array = array;`
			`return result;`
			`}`
make `@safe`, improve coverage and refresh the style 2018-06-10 08:25:34 +00:00			`///`
			`@safe unittest`
			`{`
			`{`
			`ubyte[] s = [0xEF, 0xBB, 0xBF, 'a'];`
			`FixUTFByteOrderResult r = fixUTFByteOrder(s);`
			`assert(r.encoding == UTFEncoding.UTF_8);`
			`assert(r.array.length == 1);`
			`assert(r.array == ['a']);`
			`assert(r.endian == Endian.littleEndian);`
			`}`

			`{`
			`ubyte[] s = ['a'];`
			`FixUTFByteOrderResult r = fixUTFByteOrder(s);`
			`assert(r.encoding == UTFEncoding.UTF_8);`
			`assert(r.array.length == 1);`
			`assert(r.array == ['a']);`
			`assert(r.endian == Endian.bigEndian);`
			`}`

			`{`
			`// strip 'a' b/c not complete unit`
			`ubyte[] s = [0xFE, 0xFF, 'a'];`
			`FixUTFByteOrderResult r = fixUTFByteOrder(s);`
			`assert(r.encoding == UTFEncoding.UTF_16);`
			`assert(r.array.length == 0);`
			`assert(r.endian == Endian.bigEndian);`
			`}`

			`}`