Improve performance of sanitizeUTF8/readFileUTF8.

Uses std.encoding.sanitize, which returns the original string if all code points are properly encoded. Note that performance could still be improved considerably by processing multiple bytes at once and quickly skipping over characters that don't have the most significant bit set.
2019-10-22 11:33:57 +02:00 · 2019-10-22 11:33:57 +02:00 · 2e4bc6a316
parent 81ba969fd6
commit 2e4bc6a316
1 changed files with 13 additions and 17 deletions
--- a/source/vibe/internal/string.d
+++ b/source/vibe/internal/string.d
@ -25,24 +25,20 @@ import core.exception;
 	Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
 	the original as possible.
 */
-string sanitizeUTF8(in ubyte[] str)
+string sanitizeUTF8(immutable(ubyte)[] str)
@safe pure {
-	import std.utf;
-	auto ret = appender!string();
-	ret.reserve(str.length);
-
-	size_t i = 0;
-	while (i < str.length) {
-		dchar ch = str[i];
-		try ch = std.utf.decode(cast(const(char[]))str, i);
-		catch( UTFException ){ i++; }
-		//catch( AssertError ){ i++; }
-		char[4] dst;
-		auto len = std.utf.encode(dst, ch);
-		ret.put(dst[0 .. len]);
+	import std.encoding : sanitize;
+	auto ustr = cast(immutable(char)[])str; // view the same immutable bytes as chars; the cast itself copies nothing
+	return () @trusted { return sanitize(ustr); } (); // the @trusted lambda lets this @safe function call sanitize; already-valid input comes back as the same string
 }
-
-	return ret.data;
+/// ditto
+string sanitizeUTF8(in ubyte[] str)
+@trusted pure {
+	import std.encoding : sanitize;
+	auto ustr = cast(immutable(char)[])str; // typed as immutable only so sanitize accepts it; the bytes are still borrowed from the caller, who may mutate them later
+	auto ret = sanitize(ustr);
+	if (ret.ptr is ustr.ptr) return ustr.idup; // sanitize handed back the input itself, so copy it: the borrowed buffer must not escape as an immutable string
+	else return ret; // otherwise return sanitize's result as-is
 }

 /**