Only validate UTF-8 if we get UTF-8 input (UTF-16/32 validated at conversion)

This commit is contained in:
Ferdinand Majerech 2014-07-30 22:29:40 +02:00
parent 6c15bd95cc
commit 4e2c3e6093

View file

@ -121,14 +121,10 @@ final class Reader
buffer8_ = utf8Result.utf8; buffer8_ = utf8Result.utf8;
const validateResult = buffer8_.validateUTF8NoGC; characterCount_ = utf8Result.characterCount;
enforce(validateResult.valid,
new ReaderException(validateResult.msg ~
validateResult.sequence.to!string));
// Check that all characters in buffer are printable. // Check that all characters in buffer are printable.
enforce(isPrintableValidUTF8(buffer8_), enforce(isPrintableValidUTF8(buffer8_),
new ReaderException("Special unicode characters are not allowed")); new ReaderException("Special unicode characters are not allowed"));
characterCount_ = validateResult.characterCount;
this.sliceBuilder = SliceBuilder(this); this.sliceBuilder = SliceBuilder(this);
} }
@ -616,10 +612,11 @@ private:
// //
// A struct with the following members: // A struct with the following members:
// //
// $(D string errorMessage) In case of an error, the error message is stored here. If // $(D string errorMessage) In case of an error, the error message is stored here. If
// there was no error, errorMessage is NULL. Always check this // there was no error, errorMessage is NULL. Always check
// first. // this first.
// $(D char[] utf8) input converted to UTF-8. May be a slice of input. // $(D char[] utf8) input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{ {
// Documented in function ddoc. // Documented in function ddoc.
@ -627,6 +624,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{ {
string errorMessage; string errorMessage;
char[] utf8; char[] utf8;
size_t characterCount;
} }
Result result; Result result;
@ -639,55 +637,57 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
// result = A Result struct to put encoded result and any error messages to. // result = A Result struct to put encoded result and any error messages to.
// //
// On error, result.errorMessage will be set. // On error, result.errorMessage will be set.
static void encode(C)(C[] input, ref Result result) @safe pure nothrow static void encode(C)(C[] input, ref Result result) @safe pure
{ {
try // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
// less bytes.
static if(is(C == dchar))
{ {
// We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or char[4] encodeBuf;
// less bytes. auto utf8 = cast(char[])input;
static if(is(C == dchar)) auto length = 0;
foreach(dchar c; input)
{ {
char[4] encodeBuf; ++result.characterCount;
auto utf8 = cast(char[])input; // ASCII
auto length = 0; if(c < 0x80)
foreach(dchar c; input)
{ {
// ASCII utf8[length++] = cast(char)c;
if(c < 0x80) continue;
{
utf8[length++] = cast(char)c;
continue;
}
const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
if(encodeResult.errorMessage !is null)
{
result.errorMessage = encodeResult.errorMessage;
return;
}
const bytes = encodeResult.bytes;
utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
length += bytes;
} }
result.utf8 = utf8[0 .. length];
} const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
// Unfortunately we can't do UTF-16 in place so we just use std.conv.to if(encodeResult.errorMessage !is null)
else {
{ result.errorMessage = encodeResult.errorMessage;
result.utf8 = input.to!(char[]); return;
}
const bytes = encodeResult.bytes;
utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
length += bytes;
} }
result.utf8 = utf8[0 .. length];
} }
catch(ConvException e) { result.errorMessage = e.msg; } // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
catch(UTFException e) { result.errorMessage = e.msg; } else
catch(Exception e)
{ {
assert(false, "Unexpected exception in encode(): " ~ e.msg); result.characterCount = std.utf.count(input);
result.utf8 = input.to!(char[]);
} }
} }
final switch(encoding) try final switch(encoding)
{ {
case UTFEncoding.UTF_8: result.utf8 = cast(char[])input; break; case UTFEncoding.UTF_8:
result.utf8 = cast(char[])input;
const validateResult = result.utf8.validateUTF8NoGC();
if(!validateResult.valid)
{
result.errorMessage = "UTF-8 validation error: " ~ validateResult.msg ~
validateResult.sequence.to!string;
}
result.characterCount = validateResult.characterCount;
break;
case UTFEncoding.UTF_16: case UTFEncoding.UTF_16:
assert(input.length % 2 == 0, "UTF-16 buffer size must be even"); assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
encode(cast(wchar[])input, result); encode(cast(wchar[])input, result);
@ -697,8 +697,12 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
encode(cast(dchar[])input, result); encode(cast(dchar[])input, result);
break; break;
} }
catch(ConvException e) { result.errorMessage = e.msg; }
if(result.errorMessage !is null) { return result; } catch(UTFException e) { result.errorMessage = e.msg; }
catch(Exception e)
{
assert(false, "Unexpected exception in encode(): " ~ e.msg);
}
return result; return result;
} }