Initial commit.

2014-08-06 15:35:59 +02:00 · 2014-08-06 15:35:59 +02:00 · 93c5c3834c
commit 93c5c3834c
4 changed files with 313 additions and 0 deletions
--- a/LICENSE_1_0.txt
+++ b/LICENSE_1_0.txt
@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/README.rst
+++ b/README.rst
@ -0,0 +1,104 @@
+==========
+TinyEndian
+==========
+
+------------
+Introduction
+------------
+
+TinyEndian is a minimal endianness library for the D programming language.  It has no
+external dependencies; it only needs a D compiler and Phobos (the standard library).
+TinyEndian doesn't allocate memory and is fully **@nogc** to allow use in
+high-performance code.
+
+The API is not stable and may change in the future.
+
+--------
+Features
+--------
+
+* Swap byte order of 2- or 4-byte elements in an array in place.
+* Read a UTF-8, UTF-16 or UTF-32 buffer, determine its endianness using a UTF
+  byte-order-mark and convert it to system endianness in place.
+* No external dependencies.
+* pure, nothrow and @nogc.
+
+-------------------
+Directory structure
+-------------------
+
+===============  =======================================================================
+Directory        Contents
+===============  =======================================================================
+``./``           This README file, the license and the dub package description.
+``./source``     Source code.
+===============  =======================================================================
+
+
+-----
+Usage
+-----
+
+Assuming you use `dub <http://code.dlang.org/about>`_, add this line::
+
+   "tinyendian": { "version" : "~>0.1.0" }
+
+to the ``"dependencies"`` in your project's ``dub.json``.
+
+If you don't use dub, you can directly copy the ``source/tinyendian.d`` file into your
+project.
+
+TinyEndian requires DMD 2.066 or equivalent GDC/LDC.
+
+Open ``source/tinyendian.d`` to read the API documentation.
+
+
+-------
+License
+-------
+
+TinyEndian is released under the terms of the
+`Boost Software License 1.0 <http://www.boost.org/LICENSE_1_0.txt>`_.
+This license allows you to use the source code in your own projects, open source
+or proprietary, and to modify it to suit your needs. However, in source
+distributions, you have to preserve the license headers in the source code and
+the accompanying license file.
+
+Full text of the license can be found in file ``LICENSE_1_0.txt`` and is also
+displayed here::
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+
+    Permission is hereby granted, free of charge, to any person or organization
+    obtaining a copy of the software and accompanying documentation covered by
+    this license (the "Software") to use, reproduce, display, distribute,
+    execute, and transmit the Software, and to prepare derivative works of the
+    Software, and to permit third-parties to whom the Software is furnished to
+    do so, all subject to the following:
+
+    The copyright notices in the Software and this entire statement, including
+    the above license grant, this restriction and the following disclaimer,
+    must be included in all copies of the Software, in whole or in part, and
+    all derivative works of the Software, unless such copies or derivative
+    works are solely in the form of machine-executable object code generated by
+    a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+
+-------
+Credits
+-------
+
+TinyEndian was created by Ferdinand Majerech aka Kiith-Sa kiithsacmp[AT]gmail.com .
+
+Parts of code based on the ``std.stream`` Phobos module.
+
+TinyEndian was created using Vim and DMD on Linux Mint as an endianness library for
+the `D programming language <http://www.dlang.org>`_.
--- a/package.json
+++ b/package.json
@ -0,0 +1,16 @@
+{
+    "name": "tinyendian",
+    "description": "Lightweight endianness handling library",
+    "authors": [ "Ferdinand Majerech" ],
+    "importPaths": ["source"],
+    "license": "Boost 1.0",
+    "homepage": "https://github.com/kiith-sa/tinyendian",
+    "copyright": "Copyright © 2014, Ferdinand Majerech",
+
+    "buildTypes":
+    {
+        "debug": { "buildOptions": ["debugMode", "debugInfoC"] },
+        "release": { "buildOptions": ["releaseMode", "optimize", "inline", "noBoundsCheck"] },
+        "profile": { "buildOptions": ["releaseMode", "optimize", "noBoundsCheck", "debugInfoC"] }
+    }
+}
--- a/source/tinyendian.d
+++ b/source/tinyendian.d
@ -0,0 +1,170 @@
+//          Copyright Ferdinand Majerech 2014.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+/// A minimal library providing functionality for changing the endianness of data.
+module tinyendian;
+
+
+import core.stdc.string;
+
+import std.algorithm;
+import std.system;
+import std.utf;
+
+
+/// Unicode UTF encodings that fixUTFByteOrder can report for a buffer.
+enum UTFEncoding : ubyte
+{
+    UTF_8,  /// UTF-8; also reported when the buffer has no byte-order-mark.
+    UTF_16, /// UTF-16; the buffer is handled as wchar (2-byte) items.
+    UTF_32  /// UTF-32; the buffer is handled as dchar (4-byte) items.
+}
+
+@system pure nothrow @nogc: // These attributes apply to the declarations that follow.
+
+/// Swap byte order of items in an array in place.
+///
+/// Every item is treated as an opaque group of T.sizeof bytes: the bytes are reversed
+/// without any value conversion, so the result is the same bit pattern with its bytes
+/// in opposite order regardless of whether T is an integer, character or
+/// floating-point type.
+///
+/// Params:
+///
+/// T     = Item type. Must be either 2 or 4 bytes long.
+/// array = Buffer with values to fix byte order of. An empty array is a no-op.
+void swapByteOrder(T)(T[] array)
+    if(T.sizeof == 2 || T.sizeof == 4)
+{
+    import core.bitop : bswap;
+    import std.algorithm : swap;
+
+    foreach(ref item; array)
+    {
+        static if(T.sizeof == 2)
+        {
+            // Exchange the two bytes directly; no arithmetic on T is involved.
+            auto bytes = cast(ubyte*)&item;
+            swap(bytes[0], bytes[1]);
+        }
+        else static if(T.sizeof == 4)
+        {
+            // Reinterpret the item's bits as a uint instead of converting its value:
+            // cast(uint)item would truncate a float numerically, and the uint result
+            // of bswap can't be implicitly assigned back to types such as dchar.
+            immutable uint swapped = bswap(*cast(uint*)&item);
+            item = *cast(const(T)*)&swapped;
+        }
+        else static assert(false, "Unsupported T: " ~ T.stringof);
+    }
+}
+
+/// Convert a UTF-8, UTF-16 or UTF-32 encoded buffer to system endianness in place.
+///
+/// The encoding and byte order are determined from the UTF byte-order-mark (BOM) at
+/// the beginning of array. If there is no BOM, UTF-8 is assumed (this is compatible
+/// with ASCII). A BOM, if present, is removed: the data following it is moved to the
+/// start of the buffer.
+///
+/// If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
+/// for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
+/// 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
+///
+/// Note that this function does $(B not) check if the array is a valid UTF string. It
+/// only works with the BOM and 1, 2 or 4-byte items.
+///
+/// Params:
+///
+/// array = The array with UTF-data.
+///
+/// Returns:
+///
+/// A struct with the following members:
+///
+/// $(D ubyte[] array)            A slice of the input array containing data in correct
+///                               byte order, without BOM and in case of UTF-16/UTF-32,
+///                               without stripped bytes, if any.
+/// $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
+/// $(D std.system.Endian endian) Endianness of the original array as given by its BOM.
+///                               With a UTF-8 BOM this is the system endianness; with
+///                               no BOM at all it is Endian.init.
+/// $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array,
+///                               if any. This is non-zero only if array.length was not
+///                               divisible by 2 or 4 for UTF-16 and UTF-32,
+///                               respectively.
+///
+/// Complexity: $(BIGOH array.length)
+auto fixUTFByteOrder(ubyte[] array)
+{
+    // Identifiers of the known BOMs. The values are indices into bomBytes and
+    // bomEndianness below.
+    enum BOM: ubyte
+    {
+        UTF_8     = 0,
+        UTF_16_LE = 1,
+        UTF_16_BE = 2,
+        UTF_32_LE = 3,
+        UTF_32_BE = 4,
+        None      = ubyte.max
+    }
+
+    // Both tables are taken from std.stream.
+    static immutable ubyte[][5] bomBytes = [ [0xEF, 0xBB, 0xBF],
+                                             [0xFF, 0xFE],
+                                             [0xFE, 0xFF],
+                                             [0xFF, 0xFE, 0x00, 0x00],
+                                             [0x00, 0x00, 0xFE, 0xFF] ];
+    static immutable Endian[5] bomEndianness = [ std.system.endian,
+                                                 Endian.littleEndian,
+                                                 Endian.bigEndian,
+                                                 Endian.littleEndian,
+                                                 Endian.bigEndian ];
+
+    // Members are described in the function's ddoc.
+    struct Result
+    {
+        ubyte[] array;
+        UTFEncoding encoding;
+        Endian endian;
+        uint bytesStripped = 0;
+    }
+
+    // Find the matching BOM with the highest index; BOM.None if nothing matches.
+    // The UTF-32LE BOM begins with the UTF-16LE BOM, so the later table entry must
+    // win. Searching backwards and stopping at the first hit achieves that.
+    BOM detected = BOM.None;
+    foreach_reverse(index, mark; bomBytes)
+    {
+        if(array.startsWith(mark))
+        {
+            detected = cast(BOM)index;
+            break;
+        }
+    }
+
+    Result result;
+    // Number of bytes occupied by the BOM at the start of the buffer (0 if none).
+    size_t bomLength = 0;
+    if(detected == BOM.None)
+    {
+        result.endian   = Endian.init;
+        result.encoding = UTFEncoding.UTF_8;
+    }
+    else
+    {
+        result.endian = bomEndianness[detected];
+        bomLength     = bomBytes[detected].length;
+        if(detected == BOM.UTF_8)
+        {
+            result.encoding = UTFEncoding.UTF_8;
+        }
+        else
+        {
+            const isUTF16 = detected == BOM.UTF_16_LE || detected == BOM.UTF_16_BE;
+            const size_t unitSize = isUTF16 ? 2 : 4;
+            result.encoding = isUTF16 ? UTFEncoding.UTF_16 : UTFEncoding.UTF_32;
+            // Trailing bytes that don't make up a whole code unit get dropped.
+            result.bytesStripped = cast(uint)(array.length % unitSize);
+        }
+    }
+
+    ubyte[] payload = array[0 .. $ - result.bytesStripped];
+    // With a BOM present, shift the data down so that it begins at payload[0].
+    if(bomLength != 0)
+    {
+        const remaining = payload.length - bomLength;
+        core.stdc.string.memmove(payload.ptr, payload.ptr + bomLength, remaining);
+        payload = payload[0 .. remaining];
+    }
+
+    // payload.length is a multiple of the code unit size here (see stripping above),
+    // so the array casts below are safe.
+    if(result.endian != std.system.endian)
+    {
+        switch(result.encoding)
+        {
+            case UTFEncoding.UTF_16: swapByteOrder(cast(wchar[])payload); break;
+            case UTFEncoding.UTF_32: swapByteOrder(cast(dchar[])payload); break;
+            default: break;
+        }
+    }
+
+    result.array = payload;
+    return result;
+}