Improve performance of sanitizeUTF8/readFileUTF8.
Uses std.encoding.sanitize, which returns the original string if all code points are properly encoded. Note that performance could still be improved considerably by processing multiple bytes at once and quickly skipping over characters that don't have the most significant bit set.
This commit is contained in:
parent
81ba969fd6
commit
2e4bc6a316
|
@ -25,24 +25,20 @@ import core.exception;
|
||||||
Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
|
Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
|
||||||
the original as possible.
|
the original as possible.
|
||||||
*/
|
*/
|
||||||
string sanitizeUTF8(in ubyte[] str)
|
string sanitizeUTF8(immutable(ubyte)[] str)
|
||||||
@safe pure {
|
@safe pure {
|
||||||
import std.utf;
|
import std.encoding : sanitize;
|
||||||
auto ret = appender!string();
|
auto ustr = cast(immutable(char)[])str;
|
||||||
ret.reserve(str.length);
|
return () @trusted { return sanitize(ustr); } ();
|
||||||
|
}
|
||||||
size_t i = 0;
|
/// ditto
|
||||||
while (i < str.length) {
|
string sanitizeUTF8(in ubyte[] str)
|
||||||
dchar ch = str[i];
|
@trusted pure {
|
||||||
try ch = std.utf.decode(cast(const(char[]))str, i);
|
import std.encoding : sanitize;
|
||||||
catch( UTFException ){ i++; }
|
auto ustr = cast(immutable(char)[])str;
|
||||||
//catch( AssertError ){ i++; }
|
auto ret = sanitize(ustr);
|
||||||
char[4] dst;
|
if (ret.ptr is ustr.ptr) return ustr.idup;
|
||||||
auto len = std.utf.encode(dst, ch);
|
else return ret;
|
||||||
ret.put(dst[0 .. len]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret.data;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in a new issue