Only validate UTF-8 if we get UTF-8 input (UTF-16/32 validated at conversion)

This commit is contained in:
Ferdinand Majerech 2014-07-30 22:29:40 +02:00
parent 6c15bd95cc
commit 4e2c3e6093

View file

@ -121,14 +121,10 @@ final class Reader
buffer8_ = utf8Result.utf8; buffer8_ = utf8Result.utf8;
const validateResult = buffer8_.validateUTF8NoGC; characterCount_ = utf8Result.characterCount;
enforce(validateResult.valid,
new ReaderException(validateResult.msg ~
validateResult.sequence.to!string));
// Check that all characters in buffer are printable. // Check that all characters in buffer are printable.
enforce(isPrintableValidUTF8(buffer8_), enforce(isPrintableValidUTF8(buffer8_),
new ReaderException("Special unicode characters are not allowed")); new ReaderException("Special unicode characters are not allowed"));
characterCount_ = validateResult.characterCount;
this.sliceBuilder = SliceBuilder(this); this.sliceBuilder = SliceBuilder(this);
} }
@ -617,9 +613,10 @@ private:
// A struct with the following members: // A struct with the following members:
// //
// $(D string errorMessage) In case of an error, the error message is stored here. If // $(D string errorMessage) In case of an error, the error message is stored here. If
// there was no error, errorMessage is NULL. Always check this // there was no error, errorMessage is NULL. Always check
// first. // this first.
// $(D char[] utf8) input converted to UTF-8. May be a slice of input. // $(D char[] utf8) input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{ {
// Documented in function ddoc. // Documented in function ddoc.
@ -627,6 +624,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{ {
string errorMessage; string errorMessage;
char[] utf8; char[] utf8;
size_t characterCount;
} }
Result result; Result result;
@ -639,9 +637,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
// result = A Result struct to put encoded result and any error messages to. // result = A Result struct to put encoded result and any error messages to.
// //
// On error, result.errorMessage will be set. // On error, result.errorMessage will be set.
static void encode(C)(C[] input, ref Result result) @safe pure nothrow static void encode(C)(C[] input, ref Result result) @safe pure
{
try
{ {
// We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
// less bytes. // less bytes.
@ -652,6 +648,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
auto length = 0; auto length = 0;
foreach(dchar c; input) foreach(dchar c; input)
{ {
++result.characterCount;
// ASCII // ASCII
if(c < 0x80) if(c < 0x80)
{ {
@ -674,20 +671,23 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
// Unfortunately we can't do UTF-16 in place so we just use std.conv.to // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
else else
{ {
result.characterCount = std.utf.count(input);
result.utf8 = input.to!(char[]); result.utf8 = input.to!(char[]);
} }
} }
catch(ConvException e) { result.errorMessage = e.msg; }
catch(UTFException e) { result.errorMessage = e.msg; }
catch(Exception e)
{
assert(false, "Unexpected exception in encode(): " ~ e.msg);
}
}
final switch(encoding) try final switch(encoding)
{ {
case UTFEncoding.UTF_8: result.utf8 = cast(char[])input; break; case UTFEncoding.UTF_8:
result.utf8 = cast(char[])input;
const validateResult = result.utf8.validateUTF8NoGC();
if(!validateResult.valid)
{
result.errorMessage = "UTF-8 validation error: " ~ validateResult.msg ~
validateResult.sequence.to!string;
}
result.characterCount = validateResult.characterCount;
break;
case UTFEncoding.UTF_16: case UTFEncoding.UTF_16:
assert(input.length % 2 == 0, "UTF-16 buffer size must be even"); assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
encode(cast(wchar[])input, result); encode(cast(wchar[])input, result);
@ -697,8 +697,12 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
encode(cast(dchar[])input, result); encode(cast(dchar[])input, result);
break; break;
} }
catch(ConvException e) { result.errorMessage = e.msg; }
if(result.errorMessage !is null) { return result; } catch(UTFException e) { result.errorMessage = e.msg; }
catch(Exception e)
{
assert(false, "Unexpected exception in encode(): " ~ e.msg);
}
return result; return result;
} }