UTF-8 validation now uses UTF-8 decoding code.
This commit is contained in:
parent
b5da695d6b
commit
7b699c5903
|
@ -233,20 +233,12 @@ struct ValidateResult
|
||||||
/// Is the validated string valid?
|
/// Is the validated string valid?
|
||||||
bool valid;
|
bool valid;
|
||||||
/// Number of characters in the string.
|
/// Number of characters in the string.
|
||||||
|
///
|
||||||
|
/// If the string is not valid, this is the number of valid characters before
|
||||||
|
/// hitting the first invalid sequence.
|
||||||
size_t characterCount;
|
size_t characterCount;
|
||||||
/// If the string is not valid, error message with details is here.
|
/// If the string is not valid, error message with details is here.
|
||||||
string msg;
|
string msg;
|
||||||
/// If the string is not valid, the first invalid sequence of bytes is here.
|
|
||||||
const(uint)[] sequence() @safe pure nothrow const @nogc
|
|
||||||
{
|
|
||||||
return sequenceBuffer[0 .. sequenceLength];
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Buffer for the invalid sequence of bytes if valid == false.
|
|
||||||
uint[4] sequenceBuffer;
|
|
||||||
// Number of used bytes in sequenceBuffer.
|
|
||||||
size_t sequenceLength;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Validate a UTF-8 string, checking if it is well-formed Unicode.
|
/// Validate a UTF-8 string, checking if it is well-formed Unicode.
|
||||||
|
@ -265,88 +257,12 @@ ValidateResult validateUTF8NoGC(const(char[]) str) @trusted pure nothrow @nogc
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// The following encodings are valid, except for the 5 and 6 byte combinations:
|
auto decoded = decodeUTF8NoGC!(No.validated)(str, index);
|
||||||
// 0xxxxxxx
|
if(decoded.errorMessage !is null)
|
||||||
// 110xxxxx 10xxxxxx
|
|
||||||
// 1110xxxx 10xxxxxx 10xxxxxx
|
|
||||||
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
||||||
// 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
||||||
// 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
||||||
|
|
||||||
// Dchar bitmask for different numbers of UTF-8 code units.
|
|
||||||
import std.typecons;
|
|
||||||
enum bitMask = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
|
|
||||||
auto pstr = str.ptr + index;
|
|
||||||
immutable length = str.length - index;
|
|
||||||
ubyte fst = pstr[0];
|
|
||||||
|
|
||||||
static ValidateResult error(const(char[]) str, string msg) @safe pure nothrow @nogc
|
|
||||||
{
|
{
|
||||||
ValidateResult result;
|
return ValidateResult(false, characterCount, decoded.errorMessage);
|
||||||
size_t i;
|
|
||||||
|
|
||||||
do
|
|
||||||
{
|
|
||||||
result.sequenceBuffer[i] = str[i];
|
|
||||||
} while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);
|
|
||||||
|
|
||||||
result.valid = false;
|
|
||||||
result.msg = msg;
|
|
||||||
result.sequenceLength = i;
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
++characterCount;
|
||||||
static ValidateResult invalidUTF(const(char[]) str) @safe pure nothrow @nogc
|
|
||||||
{
|
|
||||||
return error(str, "Invalid UTF-8 sequence");
|
|
||||||
}
|
|
||||||
static ValidateResult outOfBounds(const(char[]) str) @safe pure nothrow @nogc
|
|
||||||
{
|
|
||||||
return error(str, "Attempted to decode past the end of a string");
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(fst & 0x80);
|
|
||||||
// starter must have at least 2 first bits set
|
|
||||||
if((fst & 0b1100_0000) != 0b1100_0000) { return invalidUTF(pstr[0 .. length]); }
|
|
||||||
ubyte tmp = void;
|
|
||||||
dchar d = fst; // upper control bits are masked out later
|
|
||||||
fst <<= 1;
|
|
||||||
|
|
||||||
foreach (i; TypeTuple!(1, 2, 3))
|
|
||||||
{
|
|
||||||
if(i == length) { return outOfBounds(pstr[0 .. length]); }
|
|
||||||
|
|
||||||
tmp = pstr[i];
|
|
||||||
|
|
||||||
if ((tmp & 0xC0) != 0x80) { return invalidUTF(pstr[0 .. length]); }
|
|
||||||
|
|
||||||
d = (d << 6) | (tmp & 0x3F);
|
|
||||||
fst <<= 1;
|
|
||||||
|
|
||||||
if (!(fst & 0x80)) // no more bytes
|
|
||||||
{
|
|
||||||
d &= bitMask[i]; // mask out control bits
|
|
||||||
|
|
||||||
// overlong, could have been encoded with i bytes
|
|
||||||
if ((d & ~bitMask[i - 1]) == 0) { return invalidUTF(pstr[0 .. length]); }
|
|
||||||
|
|
||||||
// check for surrogates only needed for 3 bytes
|
|
||||||
static if(i == 2)
|
|
||||||
{
|
|
||||||
if (!isValidDchar(d)) { return invalidUTF(pstr[0 .. length]); }
|
|
||||||
}
|
|
||||||
|
|
||||||
++characterCount;
|
|
||||||
index += i + 1;
|
|
||||||
static if(i == 3)
|
|
||||||
{
|
|
||||||
if (d > dchar.max) { return invalidUTF(pstr[0 .. length]); }
|
|
||||||
}
|
|
||||||
continue outer;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return invalidUTF(pstr[0 .. length]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ValidateResult(true, characterCount);
|
return ValidateResult(true, characterCount);
|
||||||
|
|
|
@ -682,8 +682,9 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
|
||||||
const validateResult = result.utf8.validateUTF8NoGC();
|
const validateResult = result.utf8.validateUTF8NoGC();
|
||||||
if(!validateResult.valid)
|
if(!validateResult.valid)
|
||||||
{
|
{
|
||||||
result.errorMessage = "UTF-8 validation error: " ~ validateResult.msg ~
|
result.errorMessage = "UTF-8 validation error after character #" ~
|
||||||
validateResult.sequence.to!string;
|
validateResult.characterCount.to!string ~ ": " ~
|
||||||
|
validateResult.msg;
|
||||||
}
|
}
|
||||||
result.characterCount = validateResult.characterCount;
|
result.characterCount = validateResult.characterCount;
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Reference in a new issue