From 93c5c3834ca6411b139b328ebffe556e7ebb3358 Mon Sep 17 00:00:00 2001 From: Ferdinand Majerech Date: Wed, 6 Aug 2014 15:35:59 +0200 Subject: [PATCH] Initial commit. --- LICENSE_1_0.txt | 23 ++++++ README.rst | 104 +++++++++++++++++++++++++++ package.json | 16 +++++ source/tinyendian.d | 170 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 313 insertions(+) create mode 100644 LICENSE_1_0.txt create mode 100644 README.rst create mode 100644 package.json create mode 100644 source/tinyendian.d diff --git a/LICENSE_1_0.txt b/LICENSE_1_0.txt new file mode 100644 index 0000000..36b7cd9 --- /dev/null +++ b/LICENSE_1_0.txt @@ -0,0 +1,23 @@ +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..4e267b0 --- /dev/null +++ b/README.rst @@ -0,0 +1,104 @@ +========== +TinyEndian +========== + +------------ +Introduction +------------ + +TinyEndian is a minimal endianness library for the D programming language. It has no +external dependencies, it only needs a D compiler and Phobos (standard library). +TinyEndian doesn't allocate memory and is fully **@nogc** to allow use in +high-performance code. + +The API is not stable and may change in the future. + +-------- +Features +-------- + +* Swap byte order of 2- or 4-byte elements in an array in place. +* Read a UTF-8, UTF-16 or UTF-32 buffer, determine its endianness using a UTF + byte-order-mark and convert it to system endianness in place. +* No external dependencies. +* pure, nothrow and @nogc. + +------------------- +Directory structure +------------------- + +=============== ======================================================================= +Directory Contents +=============== ======================================================================= +``./`` This README file, the license and the dub package file. +``./source`` Source code. +=============== ======================================================================= + + +----- +Usage +----- + +Assuming you use `dub <http://code.dlang.org>`_, add this line:: + + "tinyendian": { "version" : "~>0.1.0" } + +to the ``"dependencies"`` in your project's ``dub.json``. + +If you don't use dub, you can directly copy the ``source/tinyendian.d`` file into your +project. + +TinyEndian requires DMD 2.066 or equivalent GDC/LDC. + +Open ``source/tinyendian.d`` to read the API documentation. 
+ + +------- +License +------- + +TinyEndian is released under the terms of the +`Boost Software License 1.0 <http://www.boost.org/LICENSE_1_0.txt>`_. +This license allows you to use the source code in your own projects, open source +or proprietary, and to modify it to suit your needs. However, in source +distributions, you have to preserve the license headers in the source code and +the accompanying license file. + +Full text of the license can be found in file ``LICENSE_1_0.txt`` and is also +displayed here:: + + Boost Software License - Version 1.0 - August 17th, 2003 + + Permission is hereby granted, free of charge, to any person or organization + obtaining a copy of the software and accompanying documentation covered by + this license (the "Software") to use, reproduce, display, distribute, + execute, and transmit the Software, and to prepare derivative works of the + Software, and to permit third-parties to whom the Software is furnished to + do so, all subject to the following: + + The copyright notices in the Software and this entire statement, including + the above license grant, this restriction and the following disclaimer, + must be included in all copies of the Software, in whole or in part, and + all derivative works of the Software, unless such copies or derivative + works are solely in the form of machine-executable object code generated by + a source language processor. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT + SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE + FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + + +------- +Credits +------- + +TinyEndian was created by Ferdinand Majerech aka Kiith-Sa kiithsacmp[AT]gmail.com . 
+
+Parts of code based on the ``std.stream`` Phobos module.
+
+TinyEndian was created using Vim and DMD on Linux Mint as an endianness library for the
+`D programming language <http://dlang.org>`_.
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..6313cdd
--- /dev/null
+++ b/package.json
@@ -0,0 +1,16 @@
+{
+    "name": "tinyendian",
+    "description": "Lightweight endianness handling library",
+    "authors": [ "Ferdinand Majerech" ],
+    "importPaths": ["source"],
+    "license": "Boost 1.0",
+    "homepage": "https://github.com/kiith-sa/tinyendian",
+    "copyright": "Copyright © 2014, Ferdinand Majerech",
+
+    "buildTypes":
+    {
+        "debug": { "buildOptions": ["debugMode", "debugInfoC"] },
+        "release": { "buildOptions": ["releaseMode", "optimize", "inline", "noBoundsCheck"] },
+        "profile": { "buildOptions": ["releaseMode", "optimize", "noBoundsCheck", "debugInfoC"] }
+    }
+}
diff --git a/source/tinyendian.d b/source/tinyendian.d
new file mode 100644
index 0000000..a3e33f7
--- /dev/null
+++ b/source/tinyendian.d
@@ -0,0 +1,186 @@
+//          Copyright Ferdinand Majerech 2014.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+/// A minimal library providing functionality for changing the endianness of data.
+module tinyendian;
+
+
+import core.stdc.string;
+
+import std.algorithm;
+import std.system;
+import std.utf;
+
+
+/// Unicode UTF encodings.
+enum UTFEncoding : ubyte
+{
+    UTF_8,
+    UTF_16,
+    UTF_32
+}
+
+@system pure nothrow @nogc:
+
+/// Swap byte order of items in an array in place.
+///
+/// Each item is treated as a raw group of 2 or 4 bytes: its bytes are reversed
+/// without converting its value, so any item type of a supported size can be used
+/// (e.g. wchar, dchar, ushort, uint).
+///
+/// Params:
+///
+/// T     = Item type. Must be either 2 or 4 bytes long.
+/// array = Buffer with values to fix byte order of.
+void swapByteOrder(T)(T[] array)
+    if(T.sizeof == 2 || T.sizeof == 4)
+{
+    import core.bitop;
+    // Swap the byte order of all read characters.
+    foreach(ref item; array)
+    {
+        static if(T.sizeof == 2)
+        {
+            swap(*cast(ubyte*)&item, *(cast(ubyte*)&item + 1));
+        }
+        else static if(T.sizeof == 4)
+        {
+            // Reinterpret the item's storage as a uint instead of converting its
+            // value: a value cast needs an implicit uint -> T conversion to store
+            // the result (not available for e.g. dchar) and would not preserve the
+            // bits of non-integral 4-byte types.
+            const swapped = bswap(*cast(uint*)&item);
+            item = *cast(const(T)*)&swapped;
+        }
+        else static assert(false, "Unsupported T: " ~ T.stringof);
+    }
+}
+
+/// Convert byte order of an array encoded in UTF(8/16/32) to system endianness in
+/// place.
+///
+/// Uses the UTF byte-order-mark (BOM) to determine UTF encoding. If there is no BOM
+/// at the beginning of array, UTF-8 is assumed (this is compatible with ASCII). The
+/// BOM, if any, will be removed from the buffer.
+///
+/// If the encoding is determined to be UTF-16 or UTF-32 and there aren't enough bytes
+/// for the last code unit (i.e. if array.length is odd for UTF-16 or not divisible by
+/// 4 for UTF-32), the extra bytes (1 for UTF-16, 1-3 for UTF-32) are stripped.
+///
+/// Note that this function does $(B not) check if the array is a valid UTF string. It
+/// only works with the BOM and 1,2 or 4-byte items.
+///
+/// The UTF-32LE BOM begins with the UTF-16LE BOM and the longer match wins, so
+/// UTF-16LE data whose first code unit after the BOM is 0x0000 is detected as
+/// UTF-32LE.
+///
+/// Params:
+///
+/// array = The array with UTF-data.
+///
+/// Returns:
+///
+/// A struct with the following members:
+///
+/// $(D ubyte[] array)            A slice of the input array containing data in correct
+///                               byte order, without BOM and in case of UTF-16/UTF-32,
+///                               without stripped bytes, if any.
+/// $(D UTFEncoding encoding)     Encoding of the result (UTF-8, UTF-16 or UTF-32)
+/// $(D std.system.Endian endian) Endianness of the original array, as indicated by
+///                               its BOM. Endian.init if there is no BOM.
+/// $(D uint bytesStripped)       Number of bytes stripped from a UTF-16/UTF-32 array,
+///                               if any. This is non-zero only if array.length was not
+///                               divisible by 2 or 4 for UTF-16 and UTF-32,
+///                               respectively.
+///
+/// Complexity: $(BIGOH array.length)
+auto fixUTFByteOrder(ubyte[] array)
+{
+    // Enumerates UTF BOMs, matching indices to byteOrderMarks/bomEndian.
+    enum BOM: ubyte
+    {
+        UTF_8     = 0,
+        UTF_16_LE = 1,
+        UTF_16_BE = 2,
+        UTF_32_LE = 3,
+        UTF_32_BE = 4,
+        None      = ubyte.max
+    }
+
+    // These 2 are from std.stream
+    static immutable ubyte[][5] byteOrderMarks = [ [0xEF, 0xBB, 0xBF],
+                                                   [0xFF, 0xFE],
+                                                   [0xFE, 0xFF],
+                                                   [0xFF, 0xFE, 0x00, 0x00],
+                                                   [0x00, 0x00, 0xFE, 0xFF] ];
+    static immutable Endian[5] bomEndian = [ std.system.endian,
+                                             Endian.littleEndian,
+                                             Endian.bigEndian,
+                                             Endian.littleEndian,
+                                             Endian.bigEndian ];
+
+    // Documented in function ddoc.
+    struct Result
+    {
+        ubyte[] array;
+        UTFEncoding encoding;
+        Endian endian;
+        uint bytesStripped = 0;
+    }
+    Result result;
+
+    // Detect BOM, if any, at the start of the array. BOM.None means no BOM.
+    // Need the last match: First 2 bytes of UTF-32LE BOM match the UTF-16LE BOM. If we
+    // used the first match, UTF-16LE would be detected when we have a UTF-32LE BOM.
+    BOM bomId = BOM.None;
+    foreach(i, bom; byteOrderMarks) if(array.startsWith(bom))
+    {
+        bomId = cast(BOM)i;
+    }
+
+    result.endian = (bomId != BOM.None) ? bomEndian[bomId] : Endian.init;
+
+    // Start of UTF data (after BOM, if any)
+    size_t start = 0;
+    // Determine the encoding, the BOM length and the number of trailing bytes that
+    // do not form a whole code unit.
+    with(BOM) final switch(bomId)
+    {
+        case None:      result.encoding = UTFEncoding.UTF_8; break;
+        case UTF_8:
+            start = 3;
+            result.encoding = UTFEncoding.UTF_8;
+            break;
+        case UTF_16_LE, UTF_16_BE:
+            result.bytesStripped = array.length % 2;
+            start = 2;
+            result.encoding = UTFEncoding.UTF_16;
+            break;
+        case UTF_32_LE, UTF_32_BE:
+            result.bytesStripped = array.length % 4;
+            start = 4;
+            result.encoding = UTFEncoding.UTF_32;
+            break;
+    }
+
+    // Drop the trailing bytes of an incomplete last code unit, if any.
+    array = array[0 .. $ - result.bytesStripped];
+    // If there's a BOM, we need to move data back to ensure it starts at array[0]
+    if(start != 0)
+    {
+        core.stdc.string.memmove(array.ptr, array.ptr + start, array.length - start);
+        array = array[0 .. $ - start];
+    }
+
+    // We enforce above that array.length is divisible by 2/4 for UTF-16/32
+    if(std.system.endian != result.endian)
+    {
+        if(result.encoding == UTFEncoding.UTF_16)      { swapByteOrder(cast(wchar[])array); }
+        else if(result.encoding == UTFEncoding.UTF_32) { swapByteOrder(cast(dchar[])array); }
+    }
+
+    result.array = array;
+    return result;
+}