From 2e4bc6a3166d5240518d091dd30dc744cc25cd75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B6nke=20Ludwig?=
Date: Tue, 22 Oct 2019 11:33:57 +0200
Subject: [PATCH 1/2] Improve performance of sanitizeUTF8/readFileUTF8.

Uses std.encoding.sanitize, which returns the original string if all
code points are properly encoded. Note that the performance could still
be improved considerably by iterating over multiple bytes at once, fast
skipping over characters that don't have the most significant bit set.
---
 source/vibe/internal/string.d | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/source/vibe/internal/string.d b/source/vibe/internal/string.d
index f101364..8c3387b 100644
--- a/source/vibe/internal/string.d
+++ b/source/vibe/internal/string.d
@@ -25,24 +25,20 @@ import core.exception;
 	Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
 	the original as possible.
 */
-string sanitizeUTF8(in ubyte[] str)
+string sanitizeUTF8(immutable(ubyte)[] str)
 @safe pure {
-	import std.utf;
-	auto ret = appender!string();
-	ret.reserve(str.length);
-
-	size_t i = 0;
-	while (i < str.length) {
-		dchar ch = str[i];
-		try ch = std.utf.decode(cast(const(char[]))str, i);
-		catch( UTFException ){ i++; }
-		//catch( AssertError ){ i++; }
-		char[4] dst;
-		auto len = std.utf.encode(dst, ch);
-		ret.put(dst[0 .. len]);
-	}
-
-	return ret.data;
+	import std.encoding : sanitize;
+	auto ustr = cast(immutable(char)[])str;
+	return () @trusted { return sanitize(ustr); } ();
+}
+/// ditto
+string sanitizeUTF8(in ubyte[] str)
+@trusted pure {
+	import std.encoding : sanitize;
+	auto ustr = cast(immutable(char)[])str;
+	auto ret = sanitize(ustr);
+	if (ret.ptr is ustr.ptr) return ustr.idup;
+	else return ret;
 }
 
 /**
From f56fd7580ce74395aed7d448c490f33328884913 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B6nke=20Ludwig?=
Date: Tue, 22 Oct 2019 11:41:35 +0200
Subject: [PATCH 2/2] Avoid reallocating the data in readFileUTF8 if the UTF encoding is valid.

Uses the immutable overload of sanitizeUTF8 to reuse the original buffer
if possible.
---
 source/vibe/core/file.d | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/vibe/core/file.d b/source/vibe/core/file.d
index 410ddf3..db2a1ea 100644
--- a/source/vibe/core/file.d
+++ b/source/vibe/core/file.d
@@ -118,7 +118,9 @@ string readFileUTF8(NativePath path)
 {
 	import vibe.internal.string;
 
-	return stripUTF8Bom(sanitizeUTF8(readFile(path)));
+	auto data = readFile(path);
+	auto idata = () @trusted { return data.assumeUnique; } ();
+	return stripUTF8Bom(sanitizeUTF8(idata));
 }
 /// ditto
 string readFileUTF8(string path)